/*
 *  compiler/codegen_x86.cpp - IA-32 code generator
 *
 *  Original 68040 JIT compiler for UAE, copyright 2000-2002 Bernd Meyer
 *
 *  Adaptation for Basilisk II and improvements, copyright 2000-2005
 *    Gwenole Beauchesne
 *
 *  Basilisk II (C) 1997-2008 Christian Bauer
 *
 *  Portions related to CPU detection come from linux/arch/i386/kernel/setup.c
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* This should eventually end up in machdep/, but for now, x86 is the
   only target, and it's easier this way... */

#include "flags_x86.h"

/*************************************************************************
* Some basic information about the target CPU                           *
*************************************************************************/

#define EAX_INDEX 0
#define ECX_INDEX 1
#define EDX_INDEX 2
#define EBX_INDEX 3
#define ESP_INDEX 4
#define EBP_INDEX 5
#define ESI_INDEX 6
#define EDI_INDEX 7
#if defined (__x86_64__)
    #define R8_INDEX 8
    #define R9_INDEX 9
    #define R10_INDEX 10
    #define R11_INDEX 11
    #define R12_INDEX 12
    #define R13_INDEX 13
    #define R14_INDEX 14
    #define R15_INDEX 15
#endif
/* XXX this has to match X86_Reg8H_Base + 4 */
#define AH_INDEX (0x10 + 4 + EAX_INDEX)
#define CH_INDEX (0x10 + 4 + ECX_INDEX)
#define DH_INDEX (0x10 + 4 + EDX_INDEX)
#define BH_INDEX (0x10 + 4 + EBX_INDEX)

/* The register in which subroutines return an integer return value */
#define REG_RESULT EAX_INDEX

/* The registers subroutines take their first and second argument in */
#if defined (_MSC_VER) && !defined (USE_NORMAL_CALLING_CONVENTION)
/* Handle the _fastcall parameters of ECX and EDX */
    #define REG_PAR1 ECX_INDEX
    #define REG_PAR2 EDX_INDEX
#elif defined (__x86_64__)
    #define REG_PAR1 EDI_INDEX
    #define REG_PAR2 ESI_INDEX
#else
    #define REG_PAR1 EAX_INDEX
    #define REG_PAR2 EDX_INDEX
#endif

#define REG_PC_PRE EAX_INDEX /* The register we use for preloading regs.pc_p */
#if defined (_MSC_VER) && !defined (USE_NORMAL_CALLING_CONVENTION)
    #define REG_PC_TMP EAX_INDEX
#else
    #define REG_PC_TMP ECX_INDEX /* Another register that is not the above */
#endif

#define SHIFTCOUNT_NREG ECX_INDEX  /* Register that can be used for shiftcount.
                                      -1 if any reg will do */
#define MUL_NREG1 EAX_INDEX /* %eax will hold the low 32 bits after a 32x32 mul */
#define MUL_NREG2 EDX_INDEX /* %edx will hold the high 32 bits */

#define STACK_ALIGN 16
#define STACK_OFFSET sizeof(void*)

/* Register-index lists, each terminated by -1.
   always_used holds only 4, i.e. ESP_INDEX (the stack pointer).
   can_byte / can_word presumably list the register indices usable for
   8-bit and 16-bit operands (TODO confirm against the register allocator):
   on IA-32 can_byte contains only indices 0-3 (EAX, ECX, EDX, EBX), while
   the x86-64 lists contain every index except 4. */
uae_s8 always_used[] = { 4, -1 };
#if defined (__x86_64__)
uae_s8 can_byte[] = { 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1 };
uae_s8 can_word[] = { 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1 };
#else
uae_s8 can_byte[] = { 0, 1, 2, 3, -1 };
uae_s8 can_word[] = { 0, 1, 2, 3, 5, 6, 7, -1 };
#endif

#if USE_OPTIMIZED_CALLS
/* Make sure the interpretive core does not use cpuopti */
uae_u8 call_saved[] = { 0, 0, 0, 1, 1, 1, 1, 1 };
    #error FIXME: code not ready
#else
/* cpuopti mutates instruction handlers to assume that registers are saved
   by the caller, so only index 4 (ESP_INDEX) is flagged in this table */
uae_u8 call_saved[] = { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
#endif

/* This *should* be the same as call_saved. But:
   - We might not really know which registers are saved, and which aren't,
     so we need to preserve some, but don't want to rely on everyone else
     also saving those registers
   - Special registers (such as the stack pointer) should not be "preserved"
     by pushing, even though they are "saved" across function calls
 */
/* Table indexed by register index (EAX_INDEX = 0, ECX_INDEX = 1, ...);
   a non-zero entry marks a register that has to be preserved.  The stack
   pointer (index 4) is deliberately left at 0 in both variants. */
#if defined (__x86_64__)
/* callee-saved registers as defined by Linux AMD64 ABI: rbx, rbp, rsp, r12 - r15 */
/* preserve r11 because it's generally used to hold pointers to functions */
static const uae_u8 need_to_preserve[] = { 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1 };
#else
/* callee-saved registers as defined by System V IA-32 ABI: edi, esi, ebx, ebp */
static const uae_u8 need_to_preserve[] = { 0, 0, 0, 1, 0, 1, 1, 1 };
#endif

/* Whether classes of instructions do or don't clobber the native flags */
#define CLOBBER_MOV
#define CLOBBER_LEA
#define CLOBBER_CMOV
#define CLOBBER_POP
#define CLOBBER_PUSH
#define CLOBBER_SUB clobber_flags()
#define CLOBBER_SBB clobber_flags()
#define CLOBBER_CMP clobber_flags()
#define CLOBBER_ADD clobber_flags()
#define CLOBBER_ADC clobber_flags()
#define CLOBBER_AND clobber_flags()
#define CLOBBER_OR clobber_flags()
#define CLOBBER_XOR clobber_flags()

#define CLOBBER_ROL clobber_flags()
#define CLOBBER_ROR clobber_flags()
#define CLOBBER_SHLL clobber_flags()
#define CLOBBER_SHRL clobber_flags()
#define CLOBBER_SHRA clobber_flags()
#define CLOBBER_TEST clobber_flags()
#define CLOBBER_CL16
#define CLOBBER_CL8
#define CLOBBER_SE32
#define CLOBBER_SE16
#define CLOBBER_SE8
#define CLOBBER_ZE32
#define CLOBBER_ZE16
#define CLOBBER_ZE8
#define CLOBBER_SW16 clobber_flags()
#define CLOBBER_SW32
#define CLOBBER_SETCC
#define CLOBBER_MUL clobber_flags()
#define CLOBBER_BT clobber_flags()
#define CLOBBER_BSF clobber_flags()

/* The older code generator is now deprecated.  */
#define USE_NEW_RTASM 1

#if USE_NEW_RTASM

    #if defined (__x86_64__)
        #define X86_TARGET_64BIT 1
/* The address override prefix causes a 5 cycles penalty on Intel Core
   processors. Another solution would be to decompose the load in an LEA,
   MOV (to zero-extend), MOV (from memory): is it better? */
        #define ADDR32 x86_emit_byte(0x67),
    #else
        #define ADDR32          /**/
    #endif
    #define X86_FLAT_REGISTERS 0
    #define X86_OPTIMIZE_ALU 1
    #define X86_OPTIMIZE_ROTSHI 1
    #include "codegen_x86.h"

    #define x86_emit_byte(B)        emit_byte(B)
    #define x86_emit_word(W)        emit_word(W)
    #define x86_emit_long(L)        emit_long(L)
    #define x86_emit_quad(Q)        emit_quad(Q)
    #define x86_get_target()        get_target()
    #define x86_emit_failure(MSG)   jit_fail(MSG, __FILE__, __LINE__, __FUNCTION__)

/* Report an unrecoverable code-emission failure on stderr and abort.
   Reached through x86_emit_failure(MSG), which supplies the source file,
   line and function of the failing emitter. */
static void jit_fail(const char* msg, const char* file, int line, const char* function)
{
    FILE* const sink = stderr;

    fprintf(sink,
            "JIT failure in function %s from file %s at line %d: %s\n",
            function,
            file,
            line,
            msg);
    abort();
}

/* Push register `r` on the stack: PUSHQr on x86-64 builds, PUSHLr otherwise. */
LOWFUNC(NONE, WRITE, 1, raw_push_l_r, (R4 r))
{
    #if defined (__x86_64__)
    PUSHQr(r);
    #else
    PUSHLr(r);
    #endif
}
LENDFUNC(NONE, WRITE, 1, raw_push_l_r, (R4 r))

/* Pop the top of the stack into register `r`: POPQr on x86-64 builds,
   POPLr otherwise. */
LOWFUNC(NONE, READ, 1, raw_pop_l_r, (R4 r))
{
    #if defined (__x86_64__)
    POPQr(r);
    #else
    POPLr(r);
    #endif
}
LENDFUNC(NONE, READ, 1, raw_pop_l_r, (R4 r))

/* Pop the top of the stack into memory at address `d` (no base or index
   register): POPQm on x86-64 builds, POPLm otherwise. */
LOWFUNC(NONE, READ, 1, raw_pop_l_m, (MEMW d))
{
    #if defined (__x86_64__)
    POPQm(d, X86_NOREG, X86_NOREG, 1);
    #else
    POPLm(d, X86_NOREG, X86_NOREG, 1);
    #endif
}
LENDFUNC(NONE, READ, 1, raw_pop_l_m, (MEMW d))

/* BT (long): test bit number `i` of register `r`.  Note that BTLir takes
   the bit index first. */
LOWFUNC(WRITE, NONE, 2, raw_bt_l_ri, (R4 r, IMM i))
{
    BTLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_bt_l_ri, (R4 r, IMM i))

/* BT (long): test the bit of register `r` selected by register `b`
   (BTLrr takes the bit-index register first). */
LOWFUNC(WRITE, NONE, 2, raw_bt_l_rr, (R4 r, R4 b))
{
    BTLrr(b, r);
}
LENDFUNC(WRITE, NONE, 2, raw_bt_l_rr, (R4 r, R4 b))

/* BTC (long): test and complement bit number `i` of register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_btc_l_ri, (RW4 r, IMM i))
{
    BTCLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_btc_l_ri, (RW4 r, IMM i))

/* BTC (long): test and complement the bit of `r` selected by register `b`. */
LOWFUNC(WRITE, NONE, 2, raw_btc_l_rr, (RW4 r, R4 b))
{
    BTCLrr(b, r);
}
LENDFUNC(WRITE, NONE, 2, raw_btc_l_rr, (RW4 r, R4 b))

/* BTR (long): test and reset (clear) bit number `i` of register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_btr_l_ri, (RW4 r, IMM i))
{
    BTRLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_btr_l_ri, (RW4 r, IMM i))

/* BTR (long): test and reset (clear) the bit of `r` selected by register `b`. */
LOWFUNC(WRITE, NONE, 2, raw_btr_l_rr, (RW4 r, R4 b))
{
    BTRLrr(b, r);
}
LENDFUNC(WRITE, NONE, 2, raw_btr_l_rr, (RW4 r, R4 b))

/* BTS (long): test and set bit number `i` of register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_bts_l_ri, (RW4 r, IMM i))
{
    BTSLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_bts_l_ri, (RW4 r, IMM i))

/* BTS (long): test and set the bit of `r` selected by register `b`. */
LOWFUNC(WRITE, NONE, 2, raw_bts_l_rr, (RW4 r, R4 b))
{
    BTSLrr(b, r);
}
LENDFUNC(WRITE, NONE, 2, raw_bts_l_rr, (RW4 r, R4 b))

/* SUB (word): subtract immediate `i` from register `d` via SUBWir. */
LOWFUNC(WRITE, NONE, 2, raw_sub_w_ri, (RW2 d, IMM i))
{
    SUBWir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_w_ri, (RW2 d, IMM i))

/* MOV (long): load register `d` from memory at address `s`
   (no base or index register). */
LOWFUNC(NONE, READ, 2, raw_mov_l_rm, (W4 d, MEMR s))
{
    MOVLmr(s, X86_NOREG, X86_NOREG, 1, d);
}
LENDFUNC(NONE, READ, 2, raw_mov_l_rm, (W4 d, MEMR s))

/* MOV (long): store immediate `s` to memory at address `d`
   (no base or index register). */
LOWFUNC(NONE, WRITE, 2, raw_mov_l_mi, (MEMW d, IMM s))
{
    MOVLim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_l_mi, (MEMW d, IMM s))

/* MOV (word): store immediate `s` to memory at address `d`
   (no base or index register). */
LOWFUNC(NONE, WRITE, 2, raw_mov_w_mi, (MEMW d, IMM s))
{
    MOVWim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_w_mi, (MEMW d, IMM s))

/* MOV (byte): store immediate `s` to memory at address `d`
   (no base or index register). */
LOWFUNC(NONE, WRITE, 2, raw_mov_b_mi, (MEMW d, IMM s))
{
    MOVBim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_b_mi, (MEMW d, IMM s))

/* ROL (byte): rotate the byte in memory at address `d` left by immediate `i`
   (no base or index register). */
LOWFUNC(WRITE, RMW, 2, raw_rol_b_mi, (MEMRW d, IMM i))
{
    ROLBim(i, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(WRITE, RMW, 2, raw_rol_b_mi, (MEMRW d, IMM i))

/* ROL (byte): rotate register `r` left by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_rol_b_ri, (RW1 r, IMM i))
{
    ROLBir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_b_ri, (RW1 r, IMM i))

/* ROL (word): rotate register `r` left by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_rol_w_ri, (RW2 r, IMM i))
{
    ROLWir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_w_ri, (RW2 r, IMM i))

/* ROL (long): rotate register `r` left by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_rol_l_ri, (RW4 r, IMM i))
{
    ROLLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_l_ri, (RW4 r, IMM i))

/* ROL (long): rotate register `d` left by the count held in register `r`.
   NOTE(review): callers presumably pass SHIFTCOUNT_NREG (ECX) as `r` -
   confirm. */
LOWFUNC(WRITE, NONE, 2, raw_rol_l_rr, (RW4 d, R1 r))
{
    ROLLrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_l_rr, (RW4 d, R1 r))

/* ROL (word): rotate register `d` left by the count held in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_rol_w_rr, (RW2 d, R1 r))
{
    ROLWrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_w_rr, (RW2 d, R1 r))

/* ROL (byte): rotate register `d` left by the count held in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_rol_b_rr, (RW1 d, R1 r))
{
    ROLBrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_b_rr, (RW1 d, R1 r))

/* SHL (long): shift register `d` left by the count held in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_shll_l_rr, (RW4 d, R1 r))
{
    SHLLrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_l_rr, (RW4 d, R1 r))

/* SHL (word): shift register `d` left by the count held in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_shll_w_rr, (RW2 d, R1 r))
{
    SHLWrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_w_rr, (RW2 d, R1 r))

/* SHL (byte): shift register `d` left by the count held in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_shll_b_rr, (RW1 d, R1 r))
{
    SHLBrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_b_rr, (RW1 d, R1 r))

/* ROR (byte): rotate register `r` right by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_ror_b_ri, (RW1 r, IMM i))
{
    RORBir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_b_ri, (RW1 r, IMM i))

/* ROR (word): rotate register `r` right by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_ror_w_ri, (RW2 r, IMM i))
{
    RORWir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_w_ri, (RW2 r, IMM i))

/* OR (long): OR the long at memory address `s` (no base or index register)
   into register `d`. */
LOWFUNC(WRITE, READ, 2, raw_or_l_rm, (RW4 d, MEMR s))
{
    ORLmr(s, X86_NOREG, X86_NOREG, 1, d);
}
LENDFUNC(WRITE, READ, 2, raw_or_l_rm, (RW4 d, MEMR s))

/* ROR (long): rotate register `r` right by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_ror_l_ri, (RW4 r, IMM i))
{
    RORLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_l_ri, (RW4 r, IMM i))

/* ROR (long): rotate register `d` right by the count held in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_ror_l_rr, (RW4 d, R1 r))
{
    RORLrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_l_rr, (RW4 d, R1 r))

/* ROR (word): rotate register `d` right by the count held in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_ror_w_rr, (RW2 d, R1 r))
{
    RORWrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_w_rr, (RW2 d, R1 r))

/* ROR (byte): rotate register `d` right by the count held in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_ror_b_rr, (RW1 d, R1 r))
{
    RORBrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_b_rr, (RW1 d, R1 r))

/* SHR (long): logical right shift of register `d` by the count in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_l_rr, (RW4 d, R1 r))
{
    SHRLrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_l_rr, (RW4 d, R1 r))

/* SHR (word): logical right shift of register `d` by the count in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_w_rr, (RW2 d, R1 r))
{
    SHRWrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_w_rr, (RW2 d, R1 r))

/* SHR (byte): logical right shift of register `d` by the count in register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_b_rr, (RW1 d, R1 r))
{
    SHRBrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_b_rr, (RW1 d, R1 r))

/* SAR (long): arithmetic right shift of register `d` by the count in
   register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_shra_l_rr, (RW4 d, R1 r))
{
    SARLrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_l_rr, (RW4 d, R1 r))

/* SAR (word): arithmetic right shift of register `d` by the count in
   register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_shra_w_rr, (RW2 d, R1 r))
{
    SARWrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_w_rr, (RW2 d, R1 r))

/* SAR (byte): arithmetic right shift of register `d` by the count in
   register `r`. */
LOWFUNC(WRITE, NONE, 2, raw_shra_b_rr, (RW1 d, R1 r))
{
    SARBrr(r, d);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_b_rr, (RW1 d, R1 r))

/* SHL (long): shift register `r` left by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_shll_l_ri, (RW4 r, IMM i))
{
    SHLLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_l_ri, (RW4 r, IMM i))

/* SHL (word): shift register `r` left by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_shll_w_ri, (RW2 r, IMM i))
{
    SHLWir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_w_ri, (RW2 r, IMM i))

/* SHL (byte): shift register `r` left by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_shll_b_ri, (RW1 r, IMM i))
{
    SHLBir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_b_ri, (RW1 r, IMM i))

/* SHR (long): logical right shift of register `r` by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_l_ri, (RW4 r, IMM i))
{
    SHRLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_l_ri, (RW4 r, IMM i))

/* SHR (word): logical right shift of register `r` by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_w_ri, (RW2 r, IMM i))
{
    SHRWir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_w_ri, (RW2 r, IMM i))

/* SHR (byte): logical right shift of register `r` by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_b_ri, (RW1 r, IMM i))
{
    SHRBir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_b_ri, (RW1 r, IMM i))

/* SAR (long): arithmetic right shift of register `r` by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_shra_l_ri, (RW4 r, IMM i))
{
    SARLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_l_ri, (RW4 r, IMM i))

/* SAR (word): arithmetic right shift of register `r` by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_shra_w_ri, (RW2 r, IMM i))
{
    SARWir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_w_ri, (RW2 r, IMM i))

/* SAR (byte): arithmetic right shift of register `r` by immediate `i`. */
LOWFUNC(WRITE, NONE, 2, raw_shra_b_ri, (RW1 r, IMM i))
{
    SARBir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_b_ri, (RW1 r, IMM i))

/* SAHF: store AH into the flags.  The `dummy_ah` parameter is not used in
   the body; it presumably exists only so the register allocator sees the
   AH dependency - confirm against the callers. */
LOWFUNC(WRITE, NONE, 1, raw_sahf, (R2 dummy_ah))
{
    SAHF();
}
LENDFUNC(WRITE, NONE, 1, raw_sahf, (R2 dummy_ah))

/* CPUID: emit the CPUID instruction.  The `dummy_eax` parameter is not
   used in the body. */
LOWFUNC(NONE, NONE, 1, raw_cpuid, (R4 dummy_eax))
{
    CPUID();
}
LENDFUNC(NONE, NONE, 1, raw_cpuid, (R4 dummy_eax))

/* LAHF: load the flags into AH.  The `dummy_ah` parameter is not used in
   the body. */
LOWFUNC(READ, NONE, 1, raw_lahf, (W2 dummy_ah))
{
    LAHF();
}
LENDFUNC(READ, NONE, 1, raw_lahf, (W2 dummy_ah))

/* SETcc: set byte register `d` according to condition code `cc`. */
LOWFUNC(READ, NONE, 2, raw_setcc, (W1 d, IMM cc))
{
    SETCCir(cc, d);
}
LENDFUNC(READ, NONE, 2, raw_setcc, (W1 d, IMM cc))

/* SETcc: set the byte at memory address `d` (no base or index register)
   according to condition code `cc`. */
LOWFUNC(READ, WRITE, 2, raw_setcc_m, (MEMW d, IMM cc))
{
    SETCCim(cc, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(READ, WRITE, 2, raw_setcc_m, (MEMW d, IMM cc))

/* Conditional byte move `s` -> `d` when condition `cc` holds.  Unlike the
   word and long variants this one never uses a CMOV macro and always emits
   a short conditional jump around a plain MOV. */
LOWFUNC(READ, NONE, 3, raw_cmov_b_rr, (RW1 d, R1 s, IMM cc))
{
    /* replacement using branch and mov */
    /* target_p addresses the second byte of the jump about to be emitted,
       i.e. its 8-bit displacement, which is patched below */
    int8* target_p = (int8*)x86_get_target() + 1;
    /* jump on the inverted condition (cc ^ 1), displacement 0 for now */
    JCCSii(cc ^ 1, 0);
    MOVBrr(s, d);
    /* displacement = current emit position - address just past the jump */
    *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
}
LENDFUNC(READ, NONE, 3, raw_cmov_b_rr, (RW1 d, R1 s, IMM cc))

/* Conditional word move `s` -> `d` when condition `cc` holds.  Uses CMOVWrr
   when `have_cmov` is set; otherwise emits a short conditional jump over a
   plain MOV. */
LOWFUNC(READ, NONE, 3, raw_cmov_w_rr, (RW2 d, R2 s, IMM cc))
{
    if (have_cmov)
        CMOVWrr(cc, s, d);
    else   /* replacement using branch and mov */
    {
        /* address of the jump's 8-bit displacement, patched below */
        int8* target_p = (int8*)x86_get_target() + 1;
        /* jump on the inverted condition (cc ^ 1), displacement 0 for now */
        JCCSii(cc ^ 1, 0);
        MOVWrr(s, d);
        /* displacement = current emit position - address just past the jump */
        *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
    }
}
LENDFUNC(READ, NONE, 3, raw_cmov_w_rr, (RW2 d, R2 s, IMM cc))

/* Conditional long move `s` -> `d` when condition `cc` holds.  Uses CMOVLrr
   when `have_cmov` is set; otherwise emits a short conditional jump over a
   plain MOV. */
LOWFUNC(READ, NONE, 3, raw_cmov_l_rr, (RW4 d, R4 s, IMM cc))
{
    if (have_cmov)
        CMOVLrr(cc, s, d);
    else   /* replacement using branch and mov */
    {
        /* address of the jump's 8-bit displacement, patched below */
        int8* target_p = (int8*)x86_get_target() + 1;
        /* jump on the inverted condition (cc ^ 1), displacement 0 for now */
        JCCSii(cc ^ 1, 0);
        MOVLrr(s, d);
        /* displacement = current emit position - address just past the jump */
        *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
    }
}
LENDFUNC(READ, NONE, 3, raw_cmov_l_rr, (RW4 d, R4 s, IMM cc))

/* BSF (long): bit scan forward of register `s`, result in register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_bsf_l_rr, (W4 d, R4 s))
{
    BSFLrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_bsf_l_rr, (W4 d, R4 s))

/* Sign-extend the 32-bit value in `s` into `d` via MOVSLQrr.
   NOTE(review): the long-to-quad form presumably only makes sense on
   x86-64 builds, yet this wrapper is not guarded - confirm. */
LOWFUNC(NONE, NONE, 2, raw_sign_extend_32_rr, (W4 d, R4 s))
{
    MOVSLQrr(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_sign_extend_32_rr, (W4 d, R4 s))

/* MOVSX: sign-extend the word in `s` into the long register `d`. */
LOWFUNC(NONE, NONE, 2, raw_sign_extend_16_rr, (W4 d, R2 s))
{
    MOVSWLrr(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_sign_extend_16_rr, (W4 d, R2 s))

/* MOVSX: sign-extend the byte in `s` into the long register `d`. */
LOWFUNC(NONE, NONE, 2, raw_sign_extend_8_rr, (W4 d, R1 s))
{
    MOVSBLrr(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_sign_extend_8_rr, (W4 d, R1 s))

/* MOVZX: zero-extend the word in `s` into the long register `d`. */
LOWFUNC(NONE, NONE, 2, raw_zero_extend_16_rr, (W4 d, R2 s))
{
    MOVZWLrr(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_zero_extend_16_rr, (W4 d, R2 s))

/* MOVZX: zero-extend the byte in `s` into the long register `d`. */
LOWFUNC(NONE, NONE, 2, raw_zero_extend_8_rr, (W4 d, R1 s))
{
    MOVZBLrr(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_zero_extend_8_rr, (W4 d, R1 s))

/* IMUL (long, two-operand form): multiply register `d` by register `s`,
   result in `d`. */
LOWFUNC(NONE, NONE, 2, raw_imul_32_32, (RW4 d, R4 s))
{
    IMULLrr(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_imul_32_32, (RW4 d, R4 s))

/* Signed widening multiply using the one-operand IMULLr(s) form.
   The operands are fixed: `d` must be MUL_NREG1 (EAX_INDEX) and `s` must
   be MUL_NREG2 (EDX_INDEX).  Any other pairing is logged and aborts. */
LOWFUNC(NONE, NONE, 2, raw_imul_64_32, (RW4 d, RW4 s))
{
    const int regs_ok = (d == MUL_NREG1) && (s == MUL_NREG2);

    if (!regs_ok)
    {
        write_log("Bad register in IMUL: d=%d, s=%d\n", d, s);
        abort();
    }

    IMULLr(s);
}
LENDFUNC(NONE, NONE, 2, raw_imul_64_32, (RW4 d, RW4 s))

/* Unsigned widening multiply using the one-operand MULLr(s) form.
   The operands are fixed: `d` must be MUL_NREG1 (EAX_INDEX) and `s` must
   be MUL_NREG2 (EDX_INDEX).  Any other pairing is logged and aborts. */
LOWFUNC(NONE, NONE, 2, raw_mul_64_32, (RW4 d, RW4 s))
{
    const int regs_ok = (d == MUL_NREG1) && (s == MUL_NREG2);

    if (!regs_ok)
    {
        write_log("Bad register in MUL: d=%d, s=%d\n", d, s);
        abort();
    }

    MULLr(s);
}
LENDFUNC(NONE, NONE, 2, raw_mul_64_32, (RW4 d, RW4 s))

/* Not implemented for x86: calling this aborts unconditionally and no
   instruction is emitted; `d` and `s` are ignored. */
LOWFUNC(NONE, NONE, 2, raw_mul_32_32, (RW4 d, R4 s))
{
    abort(); /* %^$&%^$%#^ x86! */
}
LENDFUNC(NONE, NONE, 2, raw_mul_32_32, (RW4 d, R4 s))

/* MOV (byte): copy register `s` to register `d`. */
LOWFUNC(NONE, NONE, 2, raw_mov_b_rr, (W1 d, R1 s))
{
    MOVBrr(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_mov_b_rr, (W1 d, R1 s))

/* MOV (word): copy register `s` to register `d`. */
LOWFUNC(NONE, NONE, 2, raw_mov_w_rr, (W2 d, R2 s))
{
    MOVWrr(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_mov_w_rr, (W2 d, R2 s))

/* MOV (long): load `d` from memory at [baser + index * factor].
   ADDR32 prepends the 0x67 address-size prefix on x86-64 builds and
   expands to nothing otherwise. */
LOWFUNC(NONE, READ, 4, raw_mov_l_rrm_indexed, (W4 d, R4 baser, R4 index, IMM factor))
{
    ADDR32 MOVLmr(0, baser, index, factor, d);
}
LENDFUNC(NONE, READ, 4, raw_mov_l_rrm_indexed, (W4 d, R4 baser, R4 index, IMM factor))

/* MOV (word): load `d` from memory at [baser + index * factor]
   (with the ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, READ, 4, raw_mov_w_rrm_indexed, (W2 d, R4 baser, R4 index, IMM factor))
{
    ADDR32 MOVWmr(0, baser, index, factor, d);
}
LENDFUNC(NONE, READ, 4, raw_mov_w_rrm_indexed, (W2 d, R4 baser, R4 index, IMM factor))

/* MOV (byte): load `d` from memory at [baser + index * factor]
   (with the ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, READ, 4, raw_mov_b_rrm_indexed, (W1 d, R4 baser, R4 index, IMM factor))
{
    ADDR32 MOVBmr(0, baser, index, factor, d);
}
LENDFUNC(NONE, READ, 4, raw_mov_b_rrm_indexed, (W1 d, R4 baser, R4 index, IMM factor))

/* MOV (long): store register `s` to memory at [baser + index * factor]
   (with the ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 4, raw_mov_l_mrr_indexed, (R4 baser, R4 index, IMM factor, R4 s))
{
    ADDR32 MOVLrm(s, 0, baser, index, factor);
}
LENDFUNC(NONE, WRITE, 4, raw_mov_l_mrr_indexed, (R4 baser, R4 index, IMM factor, R4 s))

/* MOV (word): store register `s` to memory at [baser + index * factor]
   (with the ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 4, raw_mov_w_mrr_indexed, (R4 baser, R4 index, IMM factor, R2 s))
{
    ADDR32 MOVWrm(s, 0, baser, index, factor);
}
LENDFUNC(NONE, WRITE, 4, raw_mov_w_mrr_indexed, (R4 baser, R4 index, IMM factor, R2 s))

/* MOV (byte): store register `s` to memory at [baser + index * factor]
   (with the ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 4, raw_mov_b_mrr_indexed, (R4 baser, R4 index, IMM factor, R1 s))
{
    ADDR32 MOVBrm(s, 0, baser, index, factor);
}
LENDFUNC(NONE, WRITE, 4, raw_mov_b_mrr_indexed, (R4 baser, R4 index, IMM factor, R1 s))

/* MOV (long): store register `s` to memory at
   [base + baser + index * factor] (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 5, raw_mov_l_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R4 s))
{
    ADDR32 MOVLrm(s, base, baser, index, factor);
}
LENDFUNC(NONE, WRITE, 5, raw_mov_l_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R4 s))

/* MOV (word): store register `s` to memory at
   [base + baser + index * factor] (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 5, raw_mov_w_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R2 s))
{
    ADDR32 MOVWrm(s, base, baser, index, factor);
}
LENDFUNC(NONE, WRITE, 5, raw_mov_w_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R2 s))

/* MOV (byte): store register `s` to memory at
   [base + baser + index * factor] (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 5, raw_mov_b_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R1 s))
{
    ADDR32 MOVBrm(s, base, baser, index, factor);
}
LENDFUNC(NONE, WRITE, 5, raw_mov_b_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R1 s))

/* MOV (long): load `d` from memory at [base + baser + index * factor]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, READ, 5, raw_mov_l_brrm_indexed, (W4 d, IMM base, R4 baser, R4 index, IMM factor))
{
    ADDR32 MOVLmr(base, baser, index, factor, d);
}
LENDFUNC(NONE, READ, 5, raw_mov_l_brrm_indexed, (W4 d, IMM base, R4 baser, R4 index, IMM factor))

/* MOV (word): load `d` from memory at [base + baser + index * factor]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, READ, 5, raw_mov_w_brrm_indexed, (W2 d, IMM base, R4 baser, R4 index, IMM factor))
{
    ADDR32 MOVWmr(base, baser, index, factor, d);
}
LENDFUNC(NONE, READ, 5, raw_mov_w_brrm_indexed, (W2 d, IMM base, R4 baser, R4 index, IMM factor))

/* MOV (byte): load `d` from memory at [base + baser + index * factor]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, READ, 5, raw_mov_b_brrm_indexed, (W1 d, IMM base, R4 baser, R4 index, IMM factor))
{
    ADDR32 MOVBmr(base, baser, index, factor, d);
}
LENDFUNC(NONE, READ, 5, raw_mov_b_brrm_indexed, (W1 d, IMM base, R4 baser, R4 index, IMM factor))

/* MOV (long): load `d` from memory at [base + index * factor], with no
   base register (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, READ, 4, raw_mov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM factor))
{
    ADDR32 MOVLmr(base, X86_NOREG, index, factor, d);
}
LENDFUNC(NONE, READ, 4, raw_mov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM factor))

/* Conditional long load of `d` from [base + index * factor] when `cond`
   holds.  Uses CMOVLmr when `have_cmov` is set; otherwise emits a short
   conditional jump over a plain MOV load.  On x86-64 builds ADDR32 expands
   to `x86_emit_byte(0x67),`, so the prefix and the instruction form one
   comma expression and the unbraced `if` still guards both. */
LOWFUNC(NONE, READ, 5, raw_cmov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM factor, IMM cond))
{
    if (have_cmov)
        ADDR32 CMOVLmr(cond, base, X86_NOREG, index, factor, d);
    else   /* replacement using branch and mov */
    {
        /* address of the jump's 8-bit displacement, patched below */
        int8* target_p = (int8*)x86_get_target() + 1;
        /* jump on the inverted condition (cond ^ 1), displacement 0 for now */
        JCCSii(cond ^ 1, 0);
        ADDR32 MOVLmr(base, X86_NOREG, index, factor, d);
        /* displacement = current emit position - address just past the jump */
        *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
    }
}
LENDFUNC(NONE, READ, 5, raw_cmov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM factor, IMM cond))

/* Conditional long load of `d` from memory address `mem` (no base or index
   register) when `cond` holds.  Uses CMOVLmr when `have_cmov` is set;
   otherwise emits a short conditional jump over a plain MOV load. */
LOWFUNC(NONE, READ, 3, raw_cmov_l_rm, (W4 d, IMM mem, IMM cond))
{
    if (have_cmov)
        CMOVLmr(cond, mem, X86_NOREG, X86_NOREG, 1, d);
    else   /* replacement using branch and mov */
    {
        /* address of the jump's 8-bit displacement, patched below */
        int8* target_p = (int8*)x86_get_target() + 1;
        /* jump on the inverted condition (cond ^ 1), displacement 0 for now */
        JCCSii(cond ^ 1, 0);
        MOVLmr(mem, X86_NOREG, X86_NOREG, 1, d);
        /* displacement = current emit position - address just past the jump */
        *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
    }
}
LENDFUNC(NONE, READ, 3, raw_cmov_l_rm, (W4 d, IMM mem, IMM cond))

/* MOV (long): load `d` from memory at [s + offset]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, READ, 3, raw_mov_l_rR, (W4 d, R4 s, IMM offset))
{
    ADDR32 MOVLmr(offset, s, X86_NOREG, 1, d);
}
LENDFUNC(NONE, READ, 3, raw_mov_l_rR, (W4 d, R4 s, IMM offset))

/* MOV (word): load `d` from memory at [s + offset]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, READ, 3, raw_mov_w_rR, (W2 d, R4 s, IMM offset))
{
    ADDR32 MOVWmr(offset, s, X86_NOREG, 1, d);
}
LENDFUNC(NONE, READ, 3, raw_mov_w_rR, (W2 d, R4 s, IMM offset))

/* MOV (byte): load `d` from memory at [s + offset]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, READ, 3, raw_mov_b_rR, (W1 d, R4 s, IMM offset))
{
    ADDR32 MOVBmr(offset, s, X86_NOREG, 1, d);
}
LENDFUNC(NONE, READ, 3, raw_mov_b_rR, (W1 d, R4 s, IMM offset))

/* MOV (long): load `d` from memory at [s + offset].  The body is identical
   to raw_mov_l_rR. */
LOWFUNC(NONE, READ, 3, raw_mov_l_brR, (W4 d, R4 s, IMM offset))
{
    ADDR32 MOVLmr(offset, s, X86_NOREG, 1, d);
}
LENDFUNC(NONE, READ, 3, raw_mov_l_brR, (W4 d, R4 s, IMM offset))

/* MOV (word): load `d` from memory at [s + offset].  The body is identical
   to raw_mov_w_rR. */
LOWFUNC(NONE, READ, 3, raw_mov_w_brR, (W2 d, R4 s, IMM offset))
{
    ADDR32 MOVWmr(offset, s, X86_NOREG, 1, d);
}
LENDFUNC(NONE, READ, 3, raw_mov_w_brR, (W2 d, R4 s, IMM offset))

/* MOV (byte): load `d` from memory at [s + offset].  The body is identical
   to raw_mov_b_rR. */
LOWFUNC(NONE, READ, 3, raw_mov_b_brR, (W1 d, R4 s, IMM offset))
{
    ADDR32 MOVBmr(offset, s, X86_NOREG, 1, d);
}
LENDFUNC(NONE, READ, 3, raw_mov_b_brR, (W1 d, R4 s, IMM offset))

/* MOV (long): store immediate `i` to memory at [d + offset]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 3, raw_mov_l_Ri, (R4 d, IMM i, IMM offset))
{
    ADDR32 MOVLim(i, offset, d, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_Ri, (R4 d, IMM i, IMM offset))

/* MOV (word): store immediate `i` to memory at [d + offset]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 3, raw_mov_w_Ri, (R4 d, IMM i, IMM offset))
{
    ADDR32 MOVWim(i, offset, d, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_Ri, (R4 d, IMM i, IMM offset))

/* MOV (byte): store immediate `i` to memory at [d + offset]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 3, raw_mov_b_Ri, (R4 d, IMM i, IMM offset))
{
    ADDR32 MOVBim(i, offset, d, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_Ri, (R4 d, IMM i, IMM offset))

/* MOV (long): store register `s` to memory at [d + offset]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 3, raw_mov_l_Rr, (R4 d, R4 s, IMM offset))
{
    ADDR32 MOVLrm(s, offset, d, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_Rr, (R4 d, R4 s, IMM offset))

/* MOV (word): store register `s` to memory at [d + offset]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 3, raw_mov_w_Rr, (R4 d, R2 s, IMM offset))
{
    ADDR32 MOVWrm(s, offset, d, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_Rr, (R4 d, R2 s, IMM offset))

/* MOV (byte): store register `s` to memory at [d + offset]
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(NONE, WRITE, 3, raw_mov_b_Rr, (R4 d, R1 s, IMM offset))
{
    ADDR32 MOVBrm(s, offset, d, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_Rr, (R4 d, R1 s, IMM offset))

/* LEA (long): d = s + offset.  Unlike the memory-access wrappers, no
   ADDR32 prefix is emitted here. */
LOWFUNC(NONE, NONE, 3, raw_lea_l_brr, (W4 d, R4 s, IMM offset))
{
    LEALmr(offset, s, X86_NOREG, 1, d);
}
LENDFUNC(NONE, NONE, 3, raw_lea_l_brr, (W4 d, R4 s, IMM offset))

/* LEA (long): d = s + index * factor + offset. */
LOWFUNC(NONE, NONE, 5, raw_lea_l_brr_indexed, (W4 d, R4 s, R4 index, IMM factor, IMM offset))
{
    LEALmr(offset, s, index, factor, d);
}
LENDFUNC(NONE, NONE, 5, raw_lea_l_brr_indexed, (W4 d, R4 s, R4 index, IMM factor, IMM offset))

/* LEA (long): d = s + index * factor. */
LOWFUNC(NONE, NONE, 4, raw_lea_l_rr_indexed, (W4 d, R4 s, R4 index, IMM factor))
{
    LEALmr(0, s, index, factor, d);
}
LENDFUNC(NONE, NONE, 4, raw_lea_l_rr_indexed, (W4 d, R4 s, R4 index, IMM factor))

/* LEA (long): d = index * factor, with no base register and a zero
   displacement.  The declared argument count is 3, matching the
   three-entry parameter list (d, index, factor). */
LOWFUNC(NONE, NONE, 3, raw_lea_l_r_scaled, (W4 d, R4 index, IMM factor))
{
    LEALmr(0, X86_NOREG, index, factor, d);
}
LENDFUNC(NONE, NONE, 3, raw_lea_l_r_scaled, (W4 d, R4 index, IMM factor))

/* MOV (long): store register `s` to memory at [d + offset].  The body is
   identical to raw_mov_l_Rr. */
LOWFUNC(NONE, WRITE, 3, raw_mov_l_bRr, (R4 d, R4 s, IMM offset))
{
    ADDR32 MOVLrm(s, offset, d, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_bRr, (R4 d, R4 s, IMM offset))

/* MOV (word): store register `s` to memory at [d + offset].  The body is
   identical to raw_mov_w_Rr. */
LOWFUNC(NONE, WRITE, 3, raw_mov_w_bRr, (R4 d, R2 s, IMM offset))
{
    ADDR32 MOVWrm(s, offset, d, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_bRr, (R4 d, R2 s, IMM offset))

/* MOV (byte): store register `s` to memory at [d + offset].  The body is
   identical to raw_mov_b_Rr. */
LOWFUNC(NONE, WRITE, 3, raw_mov_b_bRr, (R4 d, R1 s, IMM offset))
{
    ADDR32 MOVBrm(s, offset, d, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_bRr, (R4 d, R1 s, IMM offset))

/* BSWAP (long): reverse the byte order of register `r`. */
LOWFUNC(NONE, NONE, 1, raw_bswap_32, (RW4 r))
{
    BSWAPLr(r);
}
LENDFUNC(NONE, NONE, 1, raw_bswap_32, (RW4 r))

/* Swap the two bytes of the 16-bit register `r` by rotating it left by 8
   bits with ROLWir. */
LOWFUNC(WRITE, NONE, 1, raw_bswap_16, (RW2 r))
{
    ROLWir(8, r);
}
LENDFUNC(WRITE, NONE, 1, raw_bswap_16, (RW2 r))

/* MOV (long): copy register `s` to register `d`. */
LOWFUNC(NONE, NONE, 2, raw_mov_l_rr, (W4 d, R4 s))
{
    MOVLrr(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_mov_l_rr, (W4 d, R4 s))

/* MOV (long): store register `s` to memory at address `d`
   (no base or index register). */
LOWFUNC(NONE, WRITE, 2, raw_mov_l_mr, (IMM d, R4 s))
{
    MOVLrm(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_l_mr, (IMM d, R4 s))

/* MOV (word): store register `s` to memory at address `d`
   (no base or index register). */
LOWFUNC(NONE, WRITE, 2, raw_mov_w_mr, (IMM d, R2 s))
{
    MOVWrm(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_w_mr, (IMM d, R2 s))

/* MOV (word): load register `d` from memory at address `s`
   (no base or index register). */
LOWFUNC(NONE, READ, 2, raw_mov_w_rm, (W2 d, IMM s))
{
    MOVWmr(s, X86_NOREG, X86_NOREG, 1, d);
}
LENDFUNC(NONE, READ, 2, raw_mov_w_rm, (W2 d, IMM s))

/* MOV (byte): store register `s` to memory at address `d`
   (no base or index register). */
LOWFUNC(NONE, WRITE, 2, raw_mov_b_mr, (IMM d, R1 s))
{
    MOVBrm(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_b_mr, (IMM d, R1 s))

/* MOV (byte): load register `d` from memory at address `s`
   (no base or index register). */
LOWFUNC(NONE, READ, 2, raw_mov_b_rm, (W1 d, IMM s))
{
    MOVBmr(s, X86_NOREG, X86_NOREG, 1, d);
}
LENDFUNC(NONE, READ, 2, raw_mov_b_rm, (W1 d, IMM s))

/* MOV (long): load immediate `s` into register `d`. */
LOWFUNC(NONE, NONE, 2, raw_mov_l_ri, (W4 d, IMM s))
{
    MOVLir(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_mov_l_ri, (W4 d, IMM s))

/* MOV (word): load immediate `s` into register `d`. */
LOWFUNC(NONE, NONE, 2, raw_mov_w_ri, (W2 d, IMM s))
{
    MOVWir(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_mov_w_ri, (W2 d, IMM s))

/* MOV (byte): load immediate `s` into register `d`. */
LOWFUNC(NONE, NONE, 2, raw_mov_b_ri, (W1 d, IMM s))
{
    MOVBir(s, d);
}
LENDFUNC(NONE, NONE, 2, raw_mov_b_ri, (W1 d, IMM s))

/* ADC (long): add immediate `s` plus carry to the long at memory address
   `d` (no base or index register). */
LOWFUNC(RMW, RMW, 2, raw_adc_l_mi, (MEMRW d, IMM s))
{
    ADCLim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(RMW, RMW, 2, raw_adc_l_mi, (MEMRW d, IMM s))

/* ADD (long): add immediate `s` to the long at memory address `d`
   (no base or index register).  Note that `d` is declared IMM here, whereas
   raw_adc_l_mi and raw_sub_l_mi declare theirs as MEMRW. */
LOWFUNC(WRITE, RMW, 2, raw_add_l_mi, (IMM d, IMM s))
{
    ADDLim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(WRITE, RMW, 2, raw_add_l_mi, (IMM d, IMM s))

/* ADD (word): add immediate `s` to the word at memory address `d`
   (no base or index register). */
LOWFUNC(WRITE, RMW, 2, raw_add_w_mi, (IMM d, IMM s))
{
    ADDWim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(WRITE, RMW, 2, raw_add_w_mi, (IMM d, IMM s))

/* ADD (byte): add immediate `s` to the byte at memory address `d`
   (no base or index register). */
LOWFUNC(WRITE, RMW, 2, raw_add_b_mi, (IMM d, IMM s))
{
    ADDBim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(WRITE, RMW, 2, raw_add_b_mi, (IMM d, IMM s))

/* TEST (long): AND register `d` with immediate `i` for flags only. */
LOWFUNC(WRITE, NONE, 2, raw_test_l_ri, (R4 d, IMM i))
{
    TESTLir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_test_l_ri, (R4 d, IMM i))

/* TEST (long): AND register `d` with register `s` for flags only. */
LOWFUNC(WRITE, NONE, 2, raw_test_l_rr, (R4 d, R4 s))
{
    TESTLrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_test_l_rr, (R4 d, R4 s))

/* TEST (word): AND register `d` with register `s` for flags only. */
LOWFUNC(WRITE, NONE, 2, raw_test_w_rr, (R2 d, R2 s))
{
    TESTWrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_test_w_rr, (R2 d, R2 s))

/* TEST (byte): AND register `d` with register `s` for flags only. */
LOWFUNC(WRITE, NONE, 2, raw_test_b_rr, (R1 d, R1 s))
{
    TESTBrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_test_b_rr, (R1 d, R1 s))

/* XOR (long): XOR immediate `i` into register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_xor_l_ri, (RW4 d, IMM i))
{
    XORLir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_xor_l_ri, (RW4 d, IMM i))

/* AND (long): AND immediate `i` into register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_and_l_ri, (RW4 d, IMM i))
{
    ANDLir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_and_l_ri, (RW4 d, IMM i))

/* AND (word): AND immediate `i` into register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_and_w_ri, (RW2 d, IMM i))
{
    ANDWir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_and_w_ri, (RW2 d, IMM i))

/* AND (long): d &= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_and_l, (RW4 d, R4 s))
{
    ANDLrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_and_l, (RW4 d, R4 s))

/* AND (word): d &= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_and_w, (RW2 d, R2 s))
{
    ANDWrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_and_w, (RW2 d, R2 s))

/* AND (byte): d &= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_and_b, (RW1 d, R1 s))
{
    ANDBrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_and_b, (RW1 d, R1 s))

/* OR (long): OR immediate `i` into register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_or_l_ri, (RW4 d, IMM i))
{
    ORLir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_or_l_ri, (RW4 d, IMM i))

/* OR (long): d |= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_or_l, (RW4 d, R4 s))
{
    ORLrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_or_l, (RW4 d, R4 s))

/* OR (word): d |= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_or_w, (RW2 d, R2 s))
{
    ORWrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_or_w, (RW2 d, R2 s))

/* OR (byte): d |= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_or_b, (RW1 d, R1 s))
{
    ORBrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_or_b, (RW1 d, R1 s))

/* ADC (long): d += s + carry, register to register. */
LOWFUNC(RMW, NONE, 2, raw_adc_l, (RW4 d, R4 s))
{
    ADCLrr(s, d);
}
LENDFUNC(RMW, NONE, 2, raw_adc_l, (RW4 d, R4 s))

/* ADC (word): d += s + carry, register to register. */
LOWFUNC(RMW, NONE, 2, raw_adc_w, (RW2 d, R2 s))
{
    ADCWrr(s, d);
}
LENDFUNC(RMW, NONE, 2, raw_adc_w, (RW2 d, R2 s))

/* ADC (byte): d += s + carry, register to register. */
LOWFUNC(RMW, NONE, 2, raw_adc_b, (RW1 d, R1 s))
{
    ADCBrr(s, d);
}
LENDFUNC(RMW, NONE, 2, raw_adc_b, (RW1 d, R1 s))

/* ADD (long): d += s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_add_l, (RW4 d, R4 s))
{
    ADDLrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_add_l, (RW4 d, R4 s))

/* ADD (word): d += s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_add_w, (RW2 d, R2 s))
{
    ADDWrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_add_w, (RW2 d, R2 s))

/* ADD (byte): d += s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_add_b, (RW1 d, R1 s))
{
    ADDBrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_add_b, (RW1 d, R1 s))

/* SUB (long): subtract immediate `i` from register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_sub_l_ri, (RW4 d, IMM i))
{
    SUBLir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_l_ri, (RW4 d, IMM i))

/* SUB (byte): subtract immediate `i` from register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_sub_b_ri, (RW1 d, IMM i))
{
    SUBBir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_b_ri, (RW1 d, IMM i))

/* ADD (long): add immediate `i` to register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_add_l_ri, (RW4 d, IMM i))
{
    ADDLir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_add_l_ri, (RW4 d, IMM i))

/* ADD (word): add immediate `i` to register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_add_w_ri, (RW2 d, IMM i))
{
    ADDWir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_add_w_ri, (RW2 d, IMM i))

/* ADD (byte): add immediate `i` to register `d`. */
LOWFUNC(WRITE, NONE, 2, raw_add_b_ri, (RW1 d, IMM i))
{
    ADDBir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_add_b_ri, (RW1 d, IMM i))

/* SBB (long): d -= s + borrow, register to register. */
LOWFUNC(RMW, NONE, 2, raw_sbb_l, (RW4 d, R4 s))
{
    SBBLrr(s, d);
}
LENDFUNC(RMW, NONE, 2, raw_sbb_l, (RW4 d, R4 s))

/* SBB (word): d -= s + borrow, register to register. */
LOWFUNC(RMW, NONE, 2, raw_sbb_w, (RW2 d, R2 s))
{
    SBBWrr(s, d);
}
LENDFUNC(RMW, NONE, 2, raw_sbb_w, (RW2 d, R2 s))

/* SBB (byte): d -= s + borrow, register to register. */
LOWFUNC(RMW, NONE, 2, raw_sbb_b, (RW1 d, R1 s))
{
    SBBBrr(s, d);
}
LENDFUNC(RMW, NONE, 2, raw_sbb_b, (RW1 d, R1 s))

/* SUB (long): d -= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_sub_l, (RW4 d, R4 s))
{
    SUBLrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_l, (RW4 d, R4 s))

/* SUB (word): d -= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_sub_w, (RW2 d, R2 s))
{
    SUBWrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_w, (RW2 d, R2 s))

/* SUB (byte): d -= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_sub_b, (RW1 d, R1 s))
{
    SUBBrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_b, (RW1 d, R1 s))

/* CMP (long): compare register `d` with register `s` (flags only). */
LOWFUNC(WRITE, NONE, 2, raw_cmp_l, (R4 d, R4 s))
{
    CMPLrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_l, (R4 d, R4 s))

/* CMP (long): compare register `r` with immediate `i` (flags only). */
LOWFUNC(WRITE, NONE, 2, raw_cmp_l_ri, (R4 r, IMM i))
{
    CMPLir(i, r);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_l_ri, (R4 r, IMM i))

/* CMP (word): compare register `d` with register `s` (flags only). */
LOWFUNC(WRITE, NONE, 2, raw_cmp_w, (R2 d, R2 s))
{
    CMPWrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_w, (R2 d, R2 s))

/* CMP (byte): compare the byte at memory address `d` (no base or index
   register) with immediate `s` (flags only).  The closing LENDFUNC names
   this same function, raw_cmp_b_mi, so the LOWFUNC/LENDFUNC pair matches
   even if those macros are ever made to expand the function name. */
LOWFUNC(WRITE, READ, 2, raw_cmp_b_mi, (MEMR d, IMM s))
{
    CMPBim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(WRITE, READ, 2, raw_cmp_b_mi, (MEMR d, IMM s))

/* CMP (byte): compare register `d` with immediate `i` (flags only). */
LOWFUNC(WRITE, NONE, 2, raw_cmp_b_ri, (R1 d, IMM i))
{
    CMPBir(i, d);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_b_ri, (R1 d, IMM i))

/* CMP (byte): compare register `d` with register `s` (flags only). */
LOWFUNC(WRITE, NONE, 2, raw_cmp_b, (R1 d, R1 s))
{
    CMPBrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_b, (R1 d, R1 s))

/* CMP (long): compare register `d` with the long at
   [offset + index * factor], no base register
   (ADDR32 prefix on x86-64 builds). */
LOWFUNC(WRITE, READ, 4, raw_cmp_l_rm_indexed, (R4 d, IMM offset, R4 index, IMM factor))
{
    ADDR32 CMPLmr(offset, X86_NOREG, index, factor, d);
}
LENDFUNC(WRITE, READ, 4, raw_cmp_l_rm_indexed, (R4 d, IMM offset, R4 index, IMM factor))

/* XOR (long): d ^= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_xor_l, (RW4 d, R4 s))
{
    XORLrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_xor_l, (RW4 d, R4 s))

/* XOR (word): d ^= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_xor_w, (RW2 d, R2 s))
{
    XORWrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_xor_w, (RW2 d, R2 s))

/* XOR (byte): d ^= s, register to register. */
LOWFUNC(WRITE, NONE, 2, raw_xor_b, (RW1 d, R1 s))
{
    XORBrr(s, d);
}
LENDFUNC(WRITE, NONE, 2, raw_xor_b, (RW1 d, R1 s))

/* SUB (long): subtract immediate `s` from the long at memory address `d`
   (no base or index register). */
LOWFUNC(WRITE, RMW, 2, raw_sub_l_mi, (MEMRW d, IMM s))
{
    SUBLim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(WRITE, RMW, 2, raw_sub_l_mi, (MEMRW d, IMM s))

/* CMP (long): compare the long at memory address `d` (no base or index
   register) with immediate `s` (flags only). */
LOWFUNC(WRITE, READ, 2, raw_cmp_l_mi, (MEMR d, IMM s))
{
    CMPLim(s, d, X86_NOREG, X86_NOREG, 1);
}
LENDFUNC(WRITE, READ, 2, raw_cmp_l_mi, (MEMR d, IMM s))

/* XCHG (long): exchange the contents of registers `r1` and `r2`. */
LOWFUNC(NONE, NONE, 2, raw_xchg_l_rr, (RW4 r1, RW4 r2))
{
    XCHGLrr(r2, r1);
}
LENDFUNC(NONE, NONE, 2, raw_xchg_l_rr, (RW4 r1, RW4 r2))

/* XCHG (byte): exchange the contents of byte registers `r1` and `r2`.
   NOTE(review): the parameters are declared RW4 although the operation is
   byte-sized; RW1 would presumably be the matching annotation - confirm. */
LOWFUNC(NONE, NONE, 2, raw_xchg_b_rr, (RW4 r1, RW4 r2))
{
    XCHGBrr(r2, r1);
}
LENDFUNC(NONE, NONE, 2, raw_xchg_b_rr, (RW4 r1, RW4 r2))

/* PUSHF: push the flags register on the stack. */
LOWFUNC(READ, WRITE, 0, raw_pushfl, (void))
{
    PUSHF();
}
LENDFUNC(READ, WRITE, 0, raw_pushfl, (void))

/* POPF: pop the flags register from the stack. */
LOWFUNC(WRITE, READ, 0, raw_popfl, (void))
{
    POPF();
}
LENDFUNC(WRITE, READ, 0, raw_popfl, (void))

/* Generate floating-point instructions */
/* Emit a floating-point add with the memory operand at absolute address s via
 * the FADDDm encoder macro (both register slots X86_NOREG, scale 1).
 * NOTE(review): the "D" suffix presumably selects a double-precision operand -
 * confirm against the macro definition. */
static inline void x86_fadd_m(MEMR s)
{
    FADDDm(s, X86_NOREG, X86_NOREG, 1);
}

#else

/* Encoder tuning switches, all enabled:
 *   optimize_accum      - permits the short accumulator-only opcode forms,
 *                         used together with isaccum();
 *   optimize_imm8       - permits 8-bit immediate/displacement forms, used
 *                         together with isbyte();
 *   optimize_shift_once - permits the shift/rotate-by-one opcode forms when
 *                         the count is 1. */
const bool optimize_accum = true;
const bool optimize_imm8 = true;
const bool optimize_shift_once = true;

/*************************************************************************
* Actual encoding of the instructions on the target CPU                 *
*************************************************************************/

/* Returns nonzero when r is the accumulator register index (EAX_INDEX). */
static __inline__ int isaccum(int r)
{
    return (EAX_INDEX == r);
}

/* Returns nonzero when x lies in the signed 8-bit range [-128, 127]. */
static __inline__ int isbyte(uae_s32 x)
{
    return !(x < -128 || x > 127);
}

/* Returns nonzero when x lies in the signed 16-bit range [-32768, 32767]. */
static __inline__ int isword(uae_s32 x)
{
    return !(x < -32768 || x > 32767);
}

/* Emit the one-byte opcode 0x50+r (PUSH r32) for register r. */
LOWFUNC(NONE, WRITE, 1, raw_push_l_r, (R4 r))
{
    const int push_opcode = 0x50 + r;

    emit_byte(push_opcode);
}
LENDFUNC(NONE, WRITE, 1, raw_push_l_r, (R4 r))

/* Emit the one-byte opcode 0x58+r (POP r32) for register r. */
LOWFUNC(NONE, READ, 1, raw_pop_l_r, (R4 r))
{
    const int pop_opcode = 0x58 + r;

    emit_byte(pop_opcode);
}
LENDFUNC(NONE, READ, 1, raw_pop_l_r, (R4 r))

/* Emit 8F 05 followed by the 32-bit address d: POP into the long at absolute
 * address d. */
LOWFUNC(NONE, READ, 1, raw_pop_l_m, (MEMW d))
{
    const int opcode = 0x8f;
    const int modrm = 0x05;     /* mod=00, /0, rm=101: absolute disp32 */

    emit_byte(opcode);
    emit_byte(modrm);
    emit_long(d);
}
LENDFUNC(NONE, READ, 1, raw_pop_l_m, (MEMW d))

/* Emit 0F BA (E0+r) ib: BT on register r with immediate bit index i. */
LOWFUNC(WRITE, NONE, 2, raw_bt_l_ri, (R4 r, IMM i))
{
    const int modrm = 0xe0 + r;     /* mod=11, /4, rm=r */

    emit_byte(0x0f);
    emit_byte(0xba);
    emit_byte(modrm);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_bt_l_ri, (R4 r, IMM i))

/* Emit 0F A3 /r: BT on register r with the bit index held in register b. */
LOWFUNC(WRITE, NONE, 2, raw_bt_l_rr, (R4 r, R4 b))
{
    const int modrm = 0xc0 + 8 * b + r;     /* mod=11, reg=b, rm=r */

    emit_byte(0x0f);
    emit_byte(0xa3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_bt_l_rr, (R4 r, R4 b))

/* Emit 0F BA (F8+r) ib: BTC on register r with immediate bit index i. */
LOWFUNC(WRITE, NONE, 2, raw_btc_l_ri, (RW4 r, IMM i))
{
    const int modrm = 0xf8 + r;     /* mod=11, /7, rm=r */

    emit_byte(0x0f);
    emit_byte(0xba);
    emit_byte(modrm);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_btc_l_ri, (RW4 r, IMM i))

/* Emit 0F BB /r: BTC on register r with the bit index held in register b. */
LOWFUNC(WRITE, NONE, 2, raw_btc_l_rr, (RW4 r, R4 b))
{
    const int modrm = 0xc0 + 8 * b + r;     /* mod=11, reg=b, rm=r */

    emit_byte(0x0f);
    emit_byte(0xbb);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_btc_l_rr, (RW4 r, R4 b))


/* Emit 0F BA (F0+r) ib: BTR on register r with immediate bit index i. */
LOWFUNC(WRITE, NONE, 2, raw_btr_l_ri, (RW4 r, IMM i))
{
    const int modrm = 0xf0 + r;     /* mod=11, /6, rm=r */

    emit_byte(0x0f);
    emit_byte(0xba);
    emit_byte(modrm);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_btr_l_ri, (RW4 r, IMM i))

/* Emit 0F B3 /r: BTR on register r with the bit index held in register b. */
LOWFUNC(WRITE, NONE, 2, raw_btr_l_rr, (RW4 r, R4 b))
{
    const int modrm = 0xc0 + 8 * b + r;     /* mod=11, reg=b, rm=r */

    emit_byte(0x0f);
    emit_byte(0xb3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_btr_l_rr, (RW4 r, R4 b))

/* Emit 0F BA (E8+r) ib: BTS on register r with immediate bit index i. */
LOWFUNC(WRITE, NONE, 2, raw_bts_l_ri, (RW4 r, IMM i))
{
    const int modrm = 0xe8 + r;     /* mod=11, /5, rm=r */

    emit_byte(0x0f);
    emit_byte(0xba);
    emit_byte(modrm);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_bts_l_ri, (RW4 r, IMM i))

/* Emit 0F AB /r: BTS on register r with the bit index held in register b. */
LOWFUNC(WRITE, NONE, 2, raw_bts_l_rr, (RW4 r, R4 b))
{
    const int modrm = 0xc0 + 8 * b + r;     /* mod=11, reg=b, rm=r */

    emit_byte(0x0f);
    emit_byte(0xab);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_bts_l_rr, (RW4 r, R4 b))

/* Emit a 16-bit SUB of immediate i from register d (0x66 prefix).
 * Three encodings are chosen from:
 *   - 83 /5 ib       when i fits in a signed byte;
 *   - 2D iw          when d is the accumulator and optimize_accum is set;
 *   - 81 /5 iw       otherwise. */
LOWFUNC(WRITE, NONE, 2, raw_sub_w_ri, (RW2 d, IMM i))
{
    emit_byte(0x66);    /* operand-size prefix */

    if (isbyte(i))
    {
        emit_byte(0x83);
        emit_byte(0xe8 + d);
        emit_byte(i);
        return;
    }

    if (optimize_accum && isaccum(d))
    {
        emit_byte(0x2d);
    }
    else
    {
        emit_byte(0x81);
        emit_byte(0xe8 + d);
    }
    emit_word(i);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_w_ri, (RW2 d, IMM i))


/* Emit 8B /r with an absolute disp32 operand: load the long at address s into
 * register d. */
LOWFUNC(NONE, READ, 2, raw_mov_l_rm, (W4 d, MEMR s))
{
    const int modrm = 0x05 + 8 * d;     /* mod=00, reg=d, rm=101 (disp32) */

    emit_byte(0x8b);
    emit_byte(modrm);
    emit_long(s);
}
LENDFUNC(NONE, READ, 2, raw_mov_l_rm, (W4 d, MEMR s))

/* Emit C7 05 disp32 imm32: store the 32-bit immediate s at absolute
 * address d. */
LOWFUNC(NONE, WRITE, 2, raw_mov_l_mi, (MEMW d, IMM s))
{
    const int opcode = 0xc7;
    const int modrm = 0x05;     /* mod=00, /0, rm=101 (disp32) */

    emit_byte(opcode);
    emit_byte(modrm);
    emit_long(d);
    emit_long(s);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_l_mi, (MEMW d, IMM s))

/* Emit 66 C7 05 disp32 imm16: store the 16-bit immediate s at absolute
 * address d. */
LOWFUNC(NONE, WRITE, 2, raw_mov_w_mi, (MEMW d, IMM s))
{
    const int opcode = 0xc7;
    const int modrm = 0x05;     /* mod=00, /0, rm=101 (disp32) */

    emit_byte(0x66);            /* operand-size prefix */
    emit_byte(opcode);
    emit_byte(modrm);
    emit_long(d);
    emit_word(s);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_w_mi, (MEMW d, IMM s))

/* Emit C6 05 disp32 imm8: store the 8-bit immediate s at absolute
 * address d. */
LOWFUNC(NONE, WRITE, 2, raw_mov_b_mi, (MEMW d, IMM s))
{
    const int opcode = 0xc6;
    const int modrm = 0x05;     /* mod=00, /0, rm=101 (disp32) */

    emit_byte(opcode);
    emit_byte(modrm);
    emit_long(d);
    emit_byte(s);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_b_mi, (MEMW d, IMM s))

/* Emit a byte ROL on the memory operand at absolute address d.
 * With optimize_shift_once and i == 1 the count-free D0 form is used;
 * otherwise the C0 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, RMW, 2, raw_rol_b_mi, (MEMRW d, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd0 : 0xc0);
    emit_byte(0x05);            /* mod=00, /0, rm=101 (disp32) */
    emit_long(d);
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, RMW, 2, raw_rol_b_mi, (MEMRW d, IMM i))

/* Emit a byte ROL of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D0 form is used;
 * otherwise the C0 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_rol_b_ri, (RW1 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd0 : 0xc0);
    emit_byte(0xc0 + r);        /* mod=11, /0, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_b_ri, (RW1 r, IMM i))

/* Emit 66 C1 (C0+r) ib: 16-bit ROL of register r by immediate i.
 * The by-one short form is not used here. */
LOWFUNC(WRITE, NONE, 2, raw_rol_w_ri, (RW2 r, IMM i))
{
    const int modrm = 0xc0 + r;     /* mod=11, /0, rm=r */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xc1);
    emit_byte(modrm);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_w_ri, (RW2 r, IMM i))

/* Emit a 32-bit ROL of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D1 form is used;
 * otherwise the C1 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_rol_l_ri, (RW4 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd1 : 0xc1);
    emit_byte(0xc0 + r);        /* mod=11, /0, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_l_ri, (RW4 r, IMM i))

/* Emit D3 (C0+d): 32-bit ROL of register d by a register count.
 * r is not encoded in the emitted bytes; the D3 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_rol_l_rr, (RW4 d, R1 r))
{
    const int modrm = 0xc0 + d;     /* mod=11, /0, rm=d */

    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_l_rr, (RW4 d, R1 r))

/* Emit 66 D3 (C0+d): 16-bit ROL of register d by a register count.
 * r is not encoded in the emitted bytes; the D3 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_rol_w_rr, (RW2 d, R1 r))
{
    const int modrm = 0xc0 + d;     /* mod=11, /0, rm=d */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_w_rr, (RW2 d, R1 r))

/* Emit D2 (C0+d): 8-bit ROL of register d by a register count.
 * r is not encoded in the emitted bytes; the D2 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_rol_b_rr, (RW1 d, R1 r))
{
    const int modrm = 0xc0 + d;     /* mod=11, /0, rm=d */

    emit_byte(0xd2);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_b_rr, (RW1 d, R1 r))

/* Emit D3 (E0+d): 32-bit left shift of register d by a register count.
 * r is not encoded in the emitted bytes; the D3 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_shll_l_rr, (RW4 d, R1 r))
{
    const int modrm = 0xe0 + d;     /* mod=11, /4, rm=d */

    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_l_rr, (RW4 d, R1 r))

/* Emit 66 D3 (E0+d): 16-bit left shift of register d by a register count.
 * r is not encoded in the emitted bytes; the D3 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_shll_w_rr, (RW2 d, R1 r))
{
    const int modrm = 0xe0 + d;     /* mod=11, /4, rm=d */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_w_rr, (RW2 d, R1 r))

/* Emit D2 (E0+d): 8-bit left shift of register d by a register count.
 * r is not encoded in the emitted bytes; the D2 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_shll_b_rr, (RW1 d, R1 r))
{
    const int modrm = 0xe0 + d;     /* mod=11, /4, rm=d */

    emit_byte(0xd2);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_b_rr, (RW1 d, R1 r))

/* Emit a byte ROR of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D0 form is used;
 * otherwise the C0 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_ror_b_ri, (RW1 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd0 : 0xc0);
    emit_byte(0xc8 + r);        /* mod=11, /1, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_b_ri, (RW1 r, IMM i))

/* Emit 66 C1 (C8+r) ib: 16-bit ROR of register r by immediate i.
 * The by-one short form is not used here. */
LOWFUNC(WRITE, NONE, 2, raw_ror_w_ri, (RW2 r, IMM i))
{
    const int modrm = 0xc8 + r;     /* mod=11, /1, rm=r */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xc1);
    emit_byte(modrm);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_w_ri, (RW2 r, IMM i))

// gb-- used for making an fpcr value in compemu_fpp.cpp
/* Emit 0B /r with an absolute disp32 operand: OR the long at address s into
 * register d. */
LOWFUNC(WRITE, READ, 2, raw_or_l_rm, (RW4 d, MEMR s))
{
    const int modrm = 0x05 + 8 * d;     /* mod=00, reg=d, rm=101 (disp32) */

    emit_byte(0x0b);
    emit_byte(modrm);
    emit_long(s);
}
LENDFUNC(WRITE, READ, 2, raw_or_l_rm, (RW4 d, MEMR s))

/* Emit a 32-bit ROR of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D1 form is used;
 * otherwise the C1 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_ror_l_ri, (RW4 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd1 : 0xc1);
    emit_byte(0xc8 + r);        /* mod=11, /1, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_l_ri, (RW4 r, IMM i))

/* Emit D3 (C8+d): 32-bit ROR of register d by a register count.
 * r is not encoded in the emitted bytes; the D3 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_ror_l_rr, (RW4 d, R1 r))
{
    const int modrm = 0xc8 + d;     /* mod=11, /1, rm=d */

    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_l_rr, (RW4 d, R1 r))

/* Emit 66 D3 (C8+d): 16-bit ROR of register d by a register count.
 * r is not encoded in the emitted bytes; the D3 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_ror_w_rr, (RW2 d, R1 r))
{
    const int modrm = 0xc8 + d;     /* mod=11, /1, rm=d */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_w_rr, (RW2 d, R1 r))

/* Emit D2 (C8+d): 8-bit ROR of register d by a register count.
 * r is not encoded in the emitted bytes; the D2 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_ror_b_rr, (RW1 d, R1 r))
{
    const int modrm = 0xc8 + d;     /* mod=11, /1, rm=d */

    emit_byte(0xd2);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_b_rr, (RW1 d, R1 r))

/* Emit D3 (E8+d): 32-bit logical right shift of register d by a register
 * count. r is not encoded in the emitted bytes; the D3 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_l_rr, (RW4 d, R1 r))
{
    const int modrm = 0xe8 + d;     /* mod=11, /5, rm=d */

    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_l_rr, (RW4 d, R1 r))

/* Emit 66 D3 (E8+d): 16-bit logical right shift of register d by a register
 * count. r is not encoded in the emitted bytes; the D3 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_w_rr, (RW2 d, R1 r))
{
    const int modrm = 0xe8 + d;     /* mod=11, /5, rm=d */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_w_rr, (RW2 d, R1 r))

/* Emit D2 (E8+d): 8-bit logical right shift of register d by a register
 * count. r is not encoded in the emitted bytes; the D2 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_b_rr, (RW1 d, R1 r))
{
    const int modrm = 0xe8 + d;     /* mod=11, /5, rm=d */

    emit_byte(0xd2);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_b_rr, (RW1 d, R1 r))

/* Emit D3 (F8+d): 32-bit arithmetic right shift of register d by a register
 * count. r is not encoded in the emitted bytes; the D3 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_shra_l_rr, (RW4 d, R1 r))
{
    const int modrm = 0xf8 + d;     /* mod=11, /7, rm=d */

    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_l_rr, (RW4 d, R1 r))

/* Emit 66 D3 (F8+d): 16-bit arithmetic right shift of register d by a
 * register count. r is not encoded in the emitted bytes; the D3 form takes
 * its count from CL. */
LOWFUNC(WRITE, NONE, 2, raw_shra_w_rr, (RW2 d, R1 r))
{
    const int modrm = 0xf8 + d;     /* mod=11, /7, rm=d */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xd3);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_w_rr, (RW2 d, R1 r))

/* Emit D2 (F8+d): 8-bit arithmetic right shift of register d by a register
 * count. r is not encoded in the emitted bytes; the D2 form takes its count
 * from CL. */
LOWFUNC(WRITE, NONE, 2, raw_shra_b_rr, (RW1 d, R1 r))
{
    const int modrm = 0xf8 + d;     /* mod=11, /7, rm=d */

    emit_byte(0xd2);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_b_rr, (RW1 d, R1 r))

/* Emit a 32-bit left shift of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D1 form is used;
 * otherwise the C1 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_shll_l_ri, (RW4 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd1 : 0xc1);
    emit_byte(0xe0 + r);        /* mod=11, /4, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_l_ri, (RW4 r, IMM i))

/* Emit 66 C1 (E0+r) ib: 16-bit left shift of register r by immediate i.
 * The by-one short form is not used here. */
LOWFUNC(WRITE, NONE, 2, raw_shll_w_ri, (RW2 r, IMM i))
{
    const int modrm = 0xe0 + r;     /* mod=11, /4, rm=r */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xc1);
    emit_byte(modrm);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_w_ri, (RW2 r, IMM i))

/* Emit an 8-bit left shift of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D0 form is used;
 * otherwise the C0 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_shll_b_ri, (RW1 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd0 : 0xc0);
    emit_byte(0xe0 + r);        /* mod=11, /4, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_b_ri, (RW1 r, IMM i))

/* Emit a 32-bit logical right shift of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D1 form is used;
 * otherwise the C1 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_l_ri, (RW4 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd1 : 0xc1);
    emit_byte(0xe8 + r);        /* mod=11, /5, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_l_ri, (RW4 r, IMM i))

/* Emit 66 C1 (E8+r) ib: 16-bit logical right shift of register r by
 * immediate i. The by-one short form is not used here. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_w_ri, (RW2 r, IMM i))
{
    const int modrm = 0xe8 + r;     /* mod=11, /5, rm=r */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xc1);
    emit_byte(modrm);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_w_ri, (RW2 r, IMM i))

/* Emit an 8-bit logical right shift of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D0 form is used;
 * otherwise the C0 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_shrl_b_ri, (RW1 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd0 : 0xc0);
    emit_byte(0xe8 + r);        /* mod=11, /5, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_b_ri, (RW1 r, IMM i))

/* Emit a 32-bit arithmetic right shift of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D1 form is used;
 * otherwise the C1 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_shra_l_ri, (RW4 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd1 : 0xc1);
    emit_byte(0xf8 + r);        /* mod=11, /7, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_l_ri, (RW4 r, IMM i))

/* Emit 66 C1 (F8+r) ib: 16-bit arithmetic right shift of register r by
 * immediate i. The by-one short form is not used here. */
LOWFUNC(WRITE, NONE, 2, raw_shra_w_ri, (RW2 r, IMM i))
{
    const int modrm = 0xf8 + r;     /* mod=11, /7, rm=r */

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xc1);
    emit_byte(modrm);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_w_ri, (RW2 r, IMM i))

/* Emit an 8-bit arithmetic right shift of register r by immediate i.
 * With optimize_shift_once and i == 1 the count-free D0 form is used;
 * otherwise the C0 form is emitted with i as a trailing count byte. */
LOWFUNC(WRITE, NONE, 2, raw_shra_b_ri, (RW1 r, IMM i))
{
    const int by_one = optimize_shift_once && (i == 1);

    emit_byte(by_one ? 0xd0 : 0xc0);
    emit_byte(0xf8 + r);        /* mod=11, /7, rm=r */
    if (!by_one)
        emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_b_ri, (RW1 r, IMM i))

/* Emit the one-byte opcode 9E (SAHF). dummy_ah is not encoded; it only
 * appears in the operand list. */
LOWFUNC(WRITE, NONE, 1, raw_sahf, (R2 dummy_ah))
{
    const int sahf_opcode = 0x9e;

    emit_byte(sahf_opcode);
}
LENDFUNC(WRITE, NONE, 1, raw_sahf, (R2 dummy_ah))

/* Emit the two-byte opcode 0F A2 (CPUID). dummy_eax is not encoded; it only
 * appears in the operand list. */
LOWFUNC(NONE, NONE, 1, raw_cpuid, (R4 dummy_eax))
{
    const int escape = 0x0f;
    const int cpuid_opcode = 0xa2;

    emit_byte(escape);
    emit_byte(cpuid_opcode);
}
LENDFUNC(NONE, NONE, 1, raw_cpuid, (R4 dummy_eax))

/* Emit the one-byte opcode 9F (LAHF). dummy_ah is not encoded; it only
 * appears in the operand list. */
LOWFUNC(READ, NONE, 1, raw_lahf, (W2 dummy_ah))
{
    const int lahf_opcode = 0x9f;

    emit_byte(lahf_opcode);
}
LENDFUNC(READ, NONE, 1, raw_lahf, (W2 dummy_ah))

/* Emit 0F (90+cc) (C0+d): SETcc into byte register d for condition code cc. */
LOWFUNC(READ, NONE, 2, raw_setcc, (W1 d, IMM cc))
{
    const int setcc_opcode = 0x90 + cc;
    const int modrm = 0xc0 + d;     /* mod=11, rm=d */

    emit_byte(0x0f);
    emit_byte(setcc_opcode);
    emit_byte(modrm);
}
LENDFUNC(READ, NONE, 2, raw_setcc, (W1 d, IMM cc))

/* Emit 0F (90+cc) 05 disp32: SETcc into the byte at absolute address d for
 * condition code cc. */
LOWFUNC(READ, WRITE, 2, raw_setcc_m, (MEMW d, IMM cc))
{
    const int setcc_opcode = 0x90 + cc;

    emit_byte(0x0f);
    emit_byte(setcc_opcode);
    emit_byte(0x05);            /* mod=00, rm=101 (disp32) */
    emit_long(d);
}
LENDFUNC(READ, WRITE, 2, raw_setcc_m, (MEMW d, IMM cc))

/* Emit a conditional byte move of register s into register d for condition
 * code cc. There is no byte-sized CMOVcc, so this is always a short branch on
 * the inverted condition (cc ^ 1) over a plain MOV r/m8, r8.
 *
 * The branch displacement must equal the length of the skipped MOV, which is
 * two bytes (opcode 88 + ModRM). It was previously emitted as 3, which made
 * the not-taken path land one byte inside the following instruction. */
LOWFUNC(READ, NONE, 3, raw_cmov_b_rr, (RW1 d, R1 s, IMM cc))
{
    /* replacement using branch and mov */
    int uncc = (cc ^ 1);
    emit_byte(0x70 + uncc);
    emit_byte(2);  /* skip next 2 bytes if not cc=true */
    emit_byte(0x88);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(READ, NONE, 3, raw_cmov_b_rr, (RW1 d, R1 s, IMM cc))

/* Emit a conditional 16-bit move of register s into register d for condition
 * code cc. When have_cmov is set this is 66 0F (40+cc) /r; otherwise a short
 * branch on the inverted condition (cc ^ 1) skips a three-byte 66 89 /r
 * move. */
LOWFUNC(READ, NONE, 3, raw_cmov_w_rr, (RW2 d, R2 s, IMM cc))
{
    if (!have_cmov)
    {
        /* replacement using branch and mov */
        const int inverted = (cc ^ 1);

        emit_byte(0x70 + inverted);
        emit_byte(3); /* skip next 3 bytes if not cc=true */
        emit_byte(0x66);
        emit_byte(0x89);
        emit_byte(0xc0 + 8 * s + d);
        return;
    }

    emit_byte(0x66);
    emit_byte(0x0f);
    emit_byte(0x40 + cc);
    emit_byte(0xc0 + 8 * d + s);
}
LENDFUNC(READ, NONE, 3, raw_cmov_w_rr, (RW2 d, R2 s, IMM cc))

/* Emit a conditional 32-bit move of register s into register d for condition
 * code cc. When have_cmov is set this is 0F (40+cc) /r; otherwise a short
 * branch on the inverted condition (cc ^ 1) skips a two-byte 89 /r move. */
LOWFUNC(READ, NONE, 3, raw_cmov_l_rr, (RW4 d, R4 s, IMM cc))
{
    if (!have_cmov)
    {
        /* replacement using branch and mov */
        const int inverted = (cc ^ 1);

        emit_byte(0x70 + inverted);
        emit_byte(2); /* skip next 2 bytes if not cc=true */
        emit_byte(0x89);
        emit_byte(0xc0 + 8 * s + d);
        return;
    }

    emit_byte(0x0f);
    emit_byte(0x40 + cc);
    emit_byte(0xc0 + 8 * d + s);
}
LENDFUNC(READ, NONE, 3, raw_cmov_l_rr, (RW4 d, R4 s, IMM cc))

/* Emit 0F BC /r: BSF with destination register d and source register s. */
LOWFUNC(WRITE, NONE, 2, raw_bsf_l_rr, (W4 d, R4 s))
{
    const int modrm = 0xc0 + 8 * d + s;     /* mod=11, reg=d, rm=s */

    emit_byte(0x0f);
    emit_byte(0xbc);
    emit_byte(modrm);
}
LENDFUNC(WRITE, NONE, 2, raw_bsf_l_rr, (W4 d, R4 s))

/* Emit 0F BF /r: sign-extend the 16-bit register s into the 32-bit
 * register d. */
LOWFUNC(NONE, NONE, 2, raw_sign_extend_16_rr, (W4 d, R2 s))
{
    const int modrm = 0xc0 + 8 * d + s;     /* mod=11, reg=d, rm=s */

    emit_byte(0x0f);
    emit_byte(0xbf);
    emit_byte(modrm);
}
LENDFUNC(NONE, NONE, 2, raw_sign_extend_16_rr, (W4 d, R2 s))

/* Emit 0F BE /r: sign-extend the 8-bit register s into the 32-bit
 * register d. */
LOWFUNC(NONE, NONE, 2, raw_sign_extend_8_rr, (W4 d, R1 s))
{
    const int modrm = 0xc0 + 8 * d + s;     /* mod=11, reg=d, rm=s */

    emit_byte(0x0f);
    emit_byte(0xbe);
    emit_byte(modrm);
}
LENDFUNC(NONE, NONE, 2, raw_sign_extend_8_rr, (W4 d, R1 s))

/* Emit 0F B7 /r: zero-extend the 16-bit register s into the 32-bit
 * register d. */
LOWFUNC(NONE, NONE, 2, raw_zero_extend_16_rr, (W4 d, R2 s))
{
    const int modrm = 0xc0 + 8 * d + s;     /* mod=11, reg=d, rm=s */

    emit_byte(0x0f);
    emit_byte(0xb7);
    emit_byte(modrm);
}
LENDFUNC(NONE, NONE, 2, raw_zero_extend_16_rr, (W4 d, R2 s))

/* Emit 0F B6 /r: zero-extend the 8-bit register s into the 32-bit
 * register d. */
LOWFUNC(NONE, NONE, 2, raw_zero_extend_8_rr, (W4 d, R1 s))
{
    const int modrm = 0xc0 + 8 * d + s;     /* mod=11, reg=d, rm=s */

    emit_byte(0x0f);
    emit_byte(0xb6);
    emit_byte(modrm);
}
LENDFUNC(NONE, NONE, 2, raw_zero_extend_8_rr, (W4 d, R1 s))

/* Emit 0F AF /r: two-operand IMUL, multiplying register d by register s. */
LOWFUNC(NONE, NONE, 2, raw_imul_32_32, (RW4 d, R4 s))
{
    const int modrm = 0xc0 + 8 * d + s;     /* mod=11, reg=d, rm=s */

    emit_byte(0x0f);
    emit_byte(0xaf);
    emit_byte(modrm);
}
LENDFUNC(NONE, NONE, 2, raw_imul_32_32, (RW4 d, R4 s))

/* Emit F7 EA (one-operand IMUL with ModRM rm=2). The operand registers are
 * fixed by the encoding, so the call aborts unless d is MUL_NREG1 and s is
 * MUL_NREG2. */
LOWFUNC(NONE, NONE, 2, raw_imul_64_32, (RW4 d, RW4 s))
{
    if (!(d == MUL_NREG1 && s == MUL_NREG2))
    {
        abort();
    }

    emit_byte(0xf7);
    emit_byte(0xea);            /* mod=11, /5, rm=2 */
}
LENDFUNC(NONE, NONE, 2, raw_imul_64_32, (RW4 d, RW4 s))

/* Emit F7 E2 (one-operand MUL with ModRM rm=2). The operand registers are
 * fixed by the encoding, so the call aborts unless d is MUL_NREG1 and s is
 * MUL_NREG2.
 *
 * The diagnostic is written to stderr: abort() is not required to flush
 * buffered stdout, so a printf() here could be lost. */
LOWFUNC(NONE, NONE, 2, raw_mul_64_32, (RW4 d, RW4 s))
{
    if (d != MUL_NREG1 || s != MUL_NREG2)
    {
        fprintf(stderr, "Bad register in MUL: d=%d, s=%d\n", d, s);
        abort();
    }
    emit_byte(0xf7);
    emit_byte(0xe2);            /* mod=11, /4, rm=2 */
}
LENDFUNC(NONE, NONE, 2, raw_mul_64_32, (RW4 d, RW4 s))

/* Unconditionally aborts: this operation is deliberately unsupported.
 * The emit sequence after abort() (0F AF /r) is kept but is never reached. */
LOWFUNC(NONE, NONE, 2, raw_mul_32_32, (RW4 d, R4 s))
{
    const int modrm = 0xc0 + 8 * d + s;     /* mod=11, reg=d, rm=s */

    abort(); /* %^$&%^$%#^ x86! */
    emit_byte(0x0f);
    emit_byte(0xaf);
    emit_byte(modrm);
}
LENDFUNC(NONE, NONE, 2, raw_mul_32_32, (RW4 d, R4 s))

/* Emit 88 /r: copy byte register s into byte register d. */
LOWFUNC(NONE, NONE, 2, raw_mov_b_rr, (W1 d, R1 s))
{
    const int modrm = 0xc0 + 8 * s + d;     /* mod=11, reg=s, rm=d */

    emit_byte(0x88);
    emit_byte(modrm);
}
LENDFUNC(NONE, NONE, 2, raw_mov_b_rr, (W1 d, R1 s))

/* Emit 66 89 /r: copy 16-bit register s into 16-bit register d. */
LOWFUNC(NONE, NONE, 2, raw_mov_w_rr, (W2 d, R2 s))
{
    const int modrm = 0xc0 + 8 * s + d;     /* mod=11, reg=s, rm=d */

    emit_byte(0x66);                        /* operand-size prefix */
    emit_byte(0x89);
    emit_byte(modrm);
}
LENDFUNC(NONE, NONE, 2, raw_mov_w_rr, (W2 d, R2 s))

/* Emit 8B /r with a SIB operand: load the long at [baser + index*factor]
 * into register d. factor must be 1, 2, 4 or 8; anything else aborts.
 * When baser is register 5 (EBP_INDEX) the mod=00 form is not usable for a
 * plain base, so mod=01 with a zero disp8 is emitted instead. */
LOWFUNC(NONE, READ, 4, raw_mov_l_rrm_indexed, (W4 d, R4 baser, R4 index, IMM factor))
{
    const int ebp_base = (baser == 5);
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x8b);
    emit_byte(0x04 + 8 * d + (ebp_base ? 0x40 : 0));
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    if (ebp_base)
        emit_byte(0x00);
}
LENDFUNC(NONE, READ, 4, raw_mov_l_rrm_indexed, (W4 d, R4 baser, R4 index, IMM factor))

/* Emit 66 8B /r with a SIB operand: load the word at [baser + index*factor]
 * into register d. factor must be 1, 2, 4 or 8; anything else aborts.
 * When baser is register 5 (EBP_INDEX) the mod=00 form is not usable for a
 * plain base, so mod=01 with a zero disp8 is emitted instead. */
LOWFUNC(NONE, READ, 4, raw_mov_w_rrm_indexed, (W2 d, R4 baser, R4 index, IMM factor))
{
    const int ebp_base = (baser == 5);
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x66);                                /* operand-size prefix */
    emit_byte(0x8b);
    emit_byte(0x04 + 8 * d + (ebp_base ? 0x40 : 0));
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    if (ebp_base)
        emit_byte(0x00);
}
LENDFUNC(NONE, READ, 4, raw_mov_w_rrm_indexed, (W2 d, R4 baser, R4 index, IMM factor))

/* Emit 8A /r with a SIB operand: load the byte at [baser + index*factor]
 * into register d. factor must be 1, 2, 4 or 8; anything else aborts.
 * When baser is register 5 (EBP_INDEX) the mod=00 form is not usable for a
 * plain base, so mod=01 with a zero disp8 is emitted instead. */
LOWFUNC(NONE, READ, 4, raw_mov_b_rrm_indexed, (W1 d, R4 baser, R4 index, IMM factor))
{
    const int ebp_base = (baser == 5);
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x8a);
    emit_byte(0x04 + 8 * d + (ebp_base ? 0x40 : 0));
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    if (ebp_base)
        emit_byte(0x00);
}
LENDFUNC(NONE, READ, 4, raw_mov_b_rrm_indexed, (W1 d, R4 baser, R4 index, IMM factor))

/* Emit 89 /r with a SIB operand: store register s as a long at
 * [baser + index*factor]. factor must be 1, 2, 4 or 8; anything else aborts.
 * When baser is register 5 (EBP_INDEX) the mod=00 form is not usable for a
 * plain base, so mod=01 with a zero disp8 is emitted instead. */
LOWFUNC(NONE, WRITE, 4, raw_mov_l_mrr_indexed, (R4 baser, R4 index, IMM factor, R4 s))
{
    const int ebp_base = (baser == 5);
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x89);
    emit_byte(0x04 + 8 * s + (ebp_base ? 0x40 : 0));
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    if (ebp_base)
        emit_byte(0x00);
}
LENDFUNC(NONE, WRITE, 4, raw_mov_l_mrr_indexed, (R4 baser, R4 index, IMM factor, R4 s))

/* Emit 66 89 /r with a SIB operand: store register s as a word at
 * [baser + index*factor]. factor must be 1, 2, 4 or 8; anything else aborts.
 * When baser is register 5 (EBP_INDEX) the mod=00 form is not usable for a
 * plain base, so mod=01 with a zero disp8 is emitted instead. */
LOWFUNC(NONE, WRITE, 4, raw_mov_w_mrr_indexed, (R4 baser, R4 index, IMM factor, R2 s))
{
    const int ebp_base = (baser == 5);
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x66);                                /* operand-size prefix */
    emit_byte(0x89);
    emit_byte(0x04 + 8 * s + (ebp_base ? 0x40 : 0));
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    if (ebp_base)
        emit_byte(0x00);
}
LENDFUNC(NONE, WRITE, 4, raw_mov_w_mrr_indexed, (R4 baser, R4 index, IMM factor, R2 s))

/* Emit 88 /r with a SIB operand: store register s as a byte at
 * [baser + index*factor]. factor must be 1, 2, 4 or 8; anything else aborts.
 * When baser is register 5 (EBP_INDEX) the mod=00 form is not usable for a
 * plain base, so mod=01 with a zero disp8 is emitted instead. */
LOWFUNC(NONE, WRITE, 4, raw_mov_b_mrr_indexed, (R4 baser, R4 index, IMM factor, R1 s))
{
    const int ebp_base = (baser == 5);
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x88);
    emit_byte(0x04 + 8 * s + (ebp_base ? 0x40 : 0));
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    if (ebp_base)
        emit_byte(0x00);
}
LENDFUNC(NONE, WRITE, 4, raw_mov_b_mrr_indexed, (R4 baser, R4 index, IMM factor, R1 s))

/* Emit 89 /r with SIB and disp32: store register s as a long at
 * [base + baser + index*factor]. factor must be 1, 2, 4 or 8; anything else
 * aborts. */
LOWFUNC(NONE, WRITE, 5, raw_mov_l_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R4 s))
{
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x89);
    emit_byte(0x84 + 8 * s);                        /* mod=10, reg=s, rm=100 (SIB) */
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    emit_long(base);
}
LENDFUNC(NONE, WRITE, 5, raw_mov_l_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R4 s))

/* Emit 66 89 /r with SIB and disp32: store register s as a word at
 * [base + baser + index*factor]. factor must be 1, 2, 4 or 8; anything else
 * aborts. */
LOWFUNC(NONE, WRITE, 5, raw_mov_w_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R2 s))
{
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x66);                                /* operand-size prefix */
    emit_byte(0x89);
    emit_byte(0x84 + 8 * s);                        /* mod=10, reg=s, rm=100 (SIB) */
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    emit_long(base);
}
LENDFUNC(NONE, WRITE, 5, raw_mov_w_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R2 s))

/* Emit 88 /r with SIB and disp32: store register s as a byte at
 * [base + baser + index*factor]. factor must be 1, 2, 4 or 8; anything else
 * aborts. */
LOWFUNC(NONE, WRITE, 5, raw_mov_b_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R1 s))
{
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x88);
    emit_byte(0x84 + 8 * s);                        /* mod=10, reg=s, rm=100 (SIB) */
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    emit_long(base);
}
LENDFUNC(NONE, WRITE, 5, raw_mov_b_bmrr_indexed, (IMM base, R4 baser, R4 index, IMM factor, R1 s))

/* Emit 8B /r with SIB and disp32: load the long at
 * [base + baser + index*factor] into register d. factor must be 1, 2, 4 or
 * 8; anything else aborts. */
LOWFUNC(NONE, READ, 5, raw_mov_l_brrm_indexed, (W4 d, IMM base, R4 baser, R4 index, IMM factor))
{
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x8b);
    emit_byte(0x84 + 8 * d);                        /* mod=10, reg=d, rm=100 (SIB) */
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    emit_long(base);
}
LENDFUNC(NONE, READ, 5, raw_mov_l_brrm_indexed, (W4 d, IMM base, R4 baser, R4 index, IMM factor))

/* Emit 66 8B /r with SIB and disp32: load the word at
 * [base + baser + index*factor] into register d. factor must be 1, 2, 4 or
 * 8; anything else aborts. */
LOWFUNC(NONE, READ, 5, raw_mov_w_brrm_indexed, (W2 d, IMM base, R4 baser, R4 index, IMM factor))
{
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x66);                                /* operand-size prefix */
    emit_byte(0x8b);
    emit_byte(0x84 + 8 * d);                        /* mod=10, reg=d, rm=100 (SIB) */
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    emit_long(base);
}
LENDFUNC(NONE, READ, 5, raw_mov_w_brrm_indexed, (W2 d, IMM base, R4 baser, R4 index, IMM factor))

/* Emit 8A /r with SIB and disp32: load the byte at
 * [base + baser + index*factor] into register d. factor must be 1, 2, 4 or
 * 8; anything else aborts. */
LOWFUNC(NONE, READ, 5, raw_mov_b_brrm_indexed, (W1 d, IMM base, R4 baser, R4 index, IMM factor))
{
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x8a);
    emit_byte(0x84 + 8 * d);                        /* mod=10, reg=d, rm=100 (SIB) */
    emit_byte(baser + 8 * index + 0x40 * fi);       /* SIB */
    emit_long(base);
}
LENDFUNC(NONE, READ, 5, raw_mov_b_brrm_indexed, (W1 d, IMM base, R4 baser, R4 index, IMM factor))

/* Emit 8B /r with a base-less SIB operand: load the long at
 * [base + index*factor] into register d. factor must be 1, 2, 4 or 8;
 * anything else prints a diagnostic to stderr and aborts. */
LOWFUNC(NONE, READ, 4, raw_mov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM factor))
{
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
    {
        fprintf(stderr, "Bad factor %d in mov_l_rm_indexed!\n", factor);
        abort();
    }

    emit_byte(0x8b);
    emit_byte(0x04 + 8 * d);                    /* mod=00, reg=d, rm=100 (SIB) */
    emit_byte(0x05 + 8 * index + 64 * fi);      /* SIB with base=101: disp32, no base */
    emit_long(base);
}
LENDFUNC(NONE, READ, 4, raw_mov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM factor))

/* Emit a conditional 32-bit load of [base + index*factor] into register d
 * for condition code cond. factor must be 1, 2, 4 or 8; anything else prints
 * a diagnostic to stderr and aborts.
 *
 * With have_cmov this is 0F (40+cond) /r with a base-less SIB operand.
 * Without it, a short branch on the inverted condition (cond ^ 1) skips a
 * seven-byte plain load (8B, ModRM, SIB, disp32).
 *
 * The diagnostic now names this function; it previously reported
 * mov_l_rm_indexed. */
LOWFUNC(NONE, READ, 5, raw_cmov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM factor, IMM cond))
{
    int fi;
    switch (factor)
    {
        case 1: fi = 0; break;
        case 2: fi = 1; break;
        case 4: fi = 2; break;
        case 8: fi = 3; break;
        default:
            fprintf(stderr, "Bad factor %d in cmov_l_rm_indexed!\n", factor);
            abort();
    }
    if (have_cmov)
    {
        emit_byte(0x0f);
        emit_byte(0x40 + cond);
        emit_byte(0x04 + 8 * d);
        emit_byte(0x05 + 8 * index + 64 * fi);
        emit_long(base);
    }
    else   /* replacement using branch and mov */
    {
        int uncc = (cond ^ 1);
        emit_byte(0x70 + uncc);
        emit_byte(7); /* skip next 7 bytes if not cc=true */
        emit_byte(0x8b);
        emit_byte(0x04 + 8 * d);
        emit_byte(0x05 + 8 * index + 64 * fi);
        emit_long(base);
    }
}
LENDFUNC(NONE, READ, 5, raw_cmov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM factor, IMM cond))

/* Emit a conditional 32-bit load of the long at absolute address mem into
 * register d for condition code cond. With have_cmov this is
 * 0F (40+cond) /r disp32; otherwise a short branch on the inverted condition
 * (cond ^ 1) skips a six-byte plain load (8B, ModRM, disp32). */
LOWFUNC(NONE, READ, 3, raw_cmov_l_rm, (W4 d, IMM mem, IMM cond))
{
    if (have_cmov)
    {
        emit_byte(0x0f);
        emit_byte(0x40 + cond);
    }
    else   /* replacement using branch and mov */
    {
        const int inverted = (cond ^ 1);

        emit_byte(0x70 + inverted);
        emit_byte(6); /* skip next 6 bytes if not cc=true */
        emit_byte(0x8b);
    }

    /* Both variants finish with the same ModRM byte and absolute address. */
    emit_byte(0x05 + 8 * d);
    emit_long(mem);
}
LENDFUNC(NONE, READ, 3, raw_cmov_l_rm, (W4 d, IMM mem, IMM cond))

/* Emit 8B /r disp8: load the long at [s + offset] into register d.
 * Only the low byte of offset is emitted; the Dif()-guarded check aborts
 * when offset does not fit in a signed byte. */
LOWFUNC(NONE, READ, 3, raw_mov_l_rR, (W4 d, R4 s, IMM offset))
{
    const int modrm = 0x40 + 8 * d + s;     /* mod=01, reg=d, rm=s */

    Dif(!isbyte(offset)) abort();
    emit_byte(0x8b);
    emit_byte(modrm);
    emit_byte(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_l_rR, (W4 d, R4 s, IMM offset))

/* Emit 66 8B /r disp8: load the word at [s + offset] into register d.
 * Only the low byte of offset is emitted; the Dif()-guarded check aborts
 * when offset does not fit in a signed byte. */
LOWFUNC(NONE, READ, 3, raw_mov_w_rR, (W2 d, R4 s, IMM offset))
{
    const int modrm = 0x40 + 8 * d + s;     /* mod=01, reg=d, rm=s */

    Dif(!isbyte(offset)) abort();
    emit_byte(0x66);                        /* operand-size prefix */
    emit_byte(0x8b);
    emit_byte(modrm);
    emit_byte(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_w_rR, (W2 d, R4 s, IMM offset))

/* Emit 8A /r disp8: load the byte at [s + offset] into register d.
 * Only the low byte of offset is emitted; the Dif()-guarded check aborts
 * when offset does not fit in a signed byte. */
LOWFUNC(NONE, READ, 3, raw_mov_b_rR, (W1 d, R4 s, IMM offset))
{
    const int modrm = 0x40 + 8 * d + s;     /* mod=01, reg=d, rm=s */

    Dif(!isbyte(offset)) abort();
    emit_byte(0x8a);
    emit_byte(modrm);
    emit_byte(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_b_rR, (W1 d, R4 s, IMM offset))

/* Emit 8B /r disp32: load the long at [s + offset] into register d, always
 * using the 32-bit displacement form. */
LOWFUNC(NONE, READ, 3, raw_mov_l_brR, (W4 d, R4 s, IMM offset))
{
    const int modrm = 0x80 + 8 * d + s;     /* mod=10, reg=d, rm=s */

    emit_byte(0x8b);
    emit_byte(modrm);
    emit_long(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_l_brR, (W4 d, R4 s, IMM offset))

/* Emit 66 8B /r disp32: load the word at [s + offset] into register d,
 * always using the 32-bit displacement form. */
LOWFUNC(NONE, READ, 3, raw_mov_w_brR, (W2 d, R4 s, IMM offset))
{
    const int modrm = 0x80 + 8 * d + s;     /* mod=10, reg=d, rm=s */

    emit_byte(0x66);                        /* operand-size prefix */
    emit_byte(0x8b);
    emit_byte(modrm);
    emit_long(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_w_brR, (W2 d, R4 s, IMM offset))

/* Emit 8A /r disp32: load the byte at [s + offset] into register d, always
 * using the 32-bit displacement form. */
LOWFUNC(NONE, READ, 3, raw_mov_b_brR, (W1 d, R4 s, IMM offset))
{
    const int modrm = 0x80 + 8 * d + s;     /* mod=10, reg=d, rm=s */

    emit_byte(0x8a);
    emit_byte(modrm);
    emit_long(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_b_brR, (W1 d, R4 s, IMM offset))

/* Emit C7 /0 disp8 imm32: store the 32-bit immediate i at [d + offset].
 * Only the low byte of offset is emitted; the Dif()-guarded check aborts
 * when offset does not fit in a signed byte. */
LOWFUNC(NONE, WRITE, 3, raw_mov_l_Ri, (R4 d, IMM i, IMM offset))
{
    const int modrm = 0x40 + d;     /* mod=01, /0, rm=d */

    Dif(!isbyte(offset)) abort();
    emit_byte(0xc7);
    emit_byte(modrm);
    emit_byte(offset);
    emit_long(i);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_Ri, (R4 d, IMM i, IMM offset))

/* Emit 66 C7 /0 disp8 imm16: store the 16-bit immediate i at [d + offset].
 * Only the low byte of offset is emitted; the Dif()-guarded check aborts
 * when offset does not fit in a signed byte. */
LOWFUNC(NONE, WRITE, 3, raw_mov_w_Ri, (R4 d, IMM i, IMM offset))
{
    const int modrm = 0x40 + d;     /* mod=01, /0, rm=d */

    Dif(!isbyte(offset)) abort();
    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xc7);
    emit_byte(modrm);
    emit_byte(offset);
    emit_word(i);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_Ri, (R4 d, IMM i, IMM offset))

/* Emit C6 /0 disp8 imm8: store the 8-bit immediate i at [d + offset].
 * Only the low byte of offset is emitted; the Dif()-guarded check aborts
 * when offset does not fit in a signed byte. */
LOWFUNC(NONE, WRITE, 3, raw_mov_b_Ri, (R4 d, IMM i, IMM offset))
{
    const int modrm = 0x40 + d;     /* mod=01, /0, rm=d */

    Dif(!isbyte(offset)) abort();
    emit_byte(0xc6);
    emit_byte(modrm);
    emit_byte(offset);
    emit_byte(i);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_Ri, (R4 d, IMM i, IMM offset))

/* Emit 89 /r disp8: store register s as a long at [d + offset].
 * Only the low byte of offset is emitted; the Dif()-guarded check aborts
 * when offset does not fit in a signed byte. */
LOWFUNC(NONE, WRITE, 3, raw_mov_l_Rr, (R4 d, R4 s, IMM offset))
{
    const int modrm = 0x40 + 8 * s + d;     /* mod=01, reg=s, rm=d */

    Dif(!isbyte(offset)) abort();
    emit_byte(0x89);
    emit_byte(modrm);
    emit_byte(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_Rr, (R4 d, R4 s, IMM offset))

/* Emit 66 89 /r disp8: store register s as a word at [d + offset].
 * Only the low byte of offset is emitted; the Dif()-guarded check aborts
 * when offset does not fit in a signed byte. */
LOWFUNC(NONE, WRITE, 3, raw_mov_w_Rr, (R4 d, R2 s, IMM offset))
{
    const int modrm = 0x40 + 8 * s + d;     /* mod=01, reg=s, rm=d */

    Dif(!isbyte(offset)) abort();
    emit_byte(0x66);                        /* operand-size prefix */
    emit_byte(0x89);
    emit_byte(modrm);
    emit_byte(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_Rr, (R4 d, R2 s, IMM offset))

/* Emit 88 /r disp8: store register s as a byte at [d + offset].
 * Only the low byte of offset is emitted; the Dif()-guarded check aborts
 * when offset does not fit in a signed byte. */
LOWFUNC(NONE, WRITE, 3, raw_mov_b_Rr, (R4 d, R1 s, IMM offset))
{
    const int modrm = 0x40 + 8 * s + d;     /* mod=01, reg=s, rm=d */

    Dif(!isbyte(offset)) abort();
    emit_byte(0x88);
    emit_byte(modrm);
    emit_byte(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_Rr, (R4 d, R1 s, IMM offset))

/* Emit 8D /r: LEA of [s + offset] into register d. The disp8 form is used
 * when optimize_imm8 is set and offset fits in a signed byte; otherwise the
 * disp32 form is used. */
LOWFUNC(NONE, NONE, 3, raw_lea_l_brr, (W4 d, R4 s, IMM offset))
{
    emit_byte(0x8d);
    if (optimize_imm8 && isbyte(offset))
    {
        emit_byte(0x40 + 8 * d + s);    /* mod=01: disp8 */
        emit_byte(offset);
    }
    else
    {
        emit_byte(0x80 + 8 * d + s);    /* mod=10: disp32 */
        emit_long(offset);
    }
}
LENDFUNC(NONE, NONE, 3, raw_lea_l_brr, (W4 d, R4 s, IMM offset))

/* Emit 8D /r with a SIB operand: LEA of [s + index*factor + offset] into
 * register d. factor must be 1, 2, 4 or 8; anything else aborts. The disp8
 * form is used when optimize_imm8 is set and offset fits in a signed byte;
 * otherwise the disp32 form is used. */
LOWFUNC(NONE, NONE, 5, raw_lea_l_brr_indexed, (W4 d, R4 s, R4 index, IMM factor, IMM offset))
{
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x8d);
    if (optimize_imm8 && isbyte(offset))
    {
        emit_byte(0x44 + 8 * d);                /* mod=01, rm=100 (SIB) */
        emit_byte(0x40 * fi + 8 * index + s);   /* SIB */
        emit_byte(offset);
    }
    else
    {
        emit_byte(0x84 + 8 * d);                /* mod=10, rm=100 (SIB) */
        emit_byte(0x40 * fi + 8 * index + s);   /* SIB */
        emit_long(offset);
    }
}
LENDFUNC(NONE, NONE, 5, raw_lea_l_brr_indexed, (W4 d, R4 s, R4 index, IMM factor, IMM offset))

/* Emit 8D /r with a SIB operand: LEA of [s + index*factor] into register d.
 * factor must be 1, 2, 4 or 8; anything else aborts.
 * When s is register 5 (EBP_INDEX) the mod=00 form is not usable for a plain
 * base, so mod=01 with a zero disp8 is emitted instead. */
LOWFUNC(NONE, NONE, 4, raw_lea_l_rr_indexed, (W4 d, R4 s, R4 index, IMM factor))
{
    const int ebp_base = (s == 5);
    int fi;

    if (factor == 1)
        fi = 0;
    else if (factor == 2)
        fi = 1;
    else if (factor == 4)
        fi = 2;
    else if (factor == 8)
        fi = 3;
    else
        abort();

    emit_byte(0x8d);
    emit_byte(0x04 + 8 * d + (ebp_base ? 0x40 : 0));
    emit_byte(0x40 * fi + 8 * index + s);       /* SIB */
    if (ebp_base)
        emit_byte(0);
}
LENDFUNC(NONE, NONE, 4, raw_lea_l_rr_indexed, (W4 d, R4 s, R4 index, IMM factor))

/* Emit 89 /r: store register s as a long at [d + offset]. The disp8 form is
 * used when optimize_imm8 is set and offset fits in a signed byte; otherwise
 * the disp32 form is used. */
LOWFUNC(NONE, WRITE, 3, raw_mov_l_bRr, (R4 d, R4 s, IMM offset))
{
    emit_byte(0x89);
    if (optimize_imm8 && isbyte(offset))
    {
        emit_byte(0x40 + 8 * s + d);    /* mod=01: disp8 */
        emit_byte(offset);
    }
    else
    {
        emit_byte(0x80 + 8 * s + d);    /* mod=10: disp32 */
        emit_long(offset);
    }
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_bRr, (R4 d, R4 s, IMM offset))

/* Emit 66 89 /r disp32: store register s as a word at [d + offset], always
 * using the 32-bit displacement form. */
LOWFUNC(NONE, WRITE, 3, raw_mov_w_bRr, (R4 d, R2 s, IMM offset))
{
    const int modrm = 0x80 + 8 * s + d;     /* mod=10, reg=s, rm=d */

    emit_byte(0x66);                        /* operand-size prefix */
    emit_byte(0x89);
    emit_byte(modrm);
    emit_long(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_bRr, (R4 d, R2 s, IMM offset))

/* Emit 88 /r: store register s as a byte at [d + offset]. The disp8 form is
 * used when optimize_imm8 is set and offset fits in a signed byte; otherwise
 * the disp32 form is used. */
LOWFUNC(NONE, WRITE, 3, raw_mov_b_bRr, (R4 d, R1 s, IMM offset))
{
    emit_byte(0x88);
    if (optimize_imm8 && isbyte(offset))
    {
        emit_byte(0x40 + 8 * s + d);    /* mod=01: disp8 */
        emit_byte(offset);
    }
    else
    {
        emit_byte(0x80 + 8 * s + d);    /* mod=10: disp32 */
        emit_long(offset);
    }
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_bRr, (R4 d, R1 s, IMM offset))

/* Emit 0F (C8+r): BSWAP of 32-bit register r. */
LOWFUNC(NONE, NONE, 1, raw_bswap_32, (RW4 r))
{
    const int bswap_opcode = 0xc8 + r;

    emit_byte(0x0f);
    emit_byte(bswap_opcode);
}
LENDFUNC(NONE, NONE, 1, raw_bswap_32, (RW4 r))

/* Swap the two bytes of 16-bit register r by emitting 66 C1 (C0+r) 08,
 * i.e. a 16-bit ROL by 8. */
LOWFUNC(WRITE, NONE, 1, raw_bswap_16, (RW2 r))
{
    const int modrm = 0xc0 + r;     /* mod=11, /0, rm=r */
    const int rotate_count = 0x08;

    emit_byte(0x66);                /* operand-size prefix */
    emit_byte(0xc1);
    emit_byte(modrm);
    emit_byte(rotate_count);
}
LENDFUNC(WRITE, NONE, 1, raw_bswap_16, (RW2 r))

/* Emit 89 /r: copy 32-bit register s into 32-bit register d. */
LOWFUNC(NONE, NONE, 2, raw_mov_l_rr, (W4 d, R4 s))
{
    const int modrm = 0xc0 + 8 * s + d;     /* mod=11, reg=s, rm=d */

    emit_byte(0x89);
    emit_byte(modrm);
}
LENDFUNC(NONE, NONE, 2, raw_mov_l_rr, (W4 d, R4 s))

/* Emit 89 /r with an absolute disp32 operand: store register s as a long at
 * address d. */
LOWFUNC(NONE, WRITE, 2, raw_mov_l_mr, (IMM d, R4 s))
{
    const int modrm = 0x05 + 8 * s;     /* mod=00, reg=s, rm=101 (disp32) */

    emit_byte(0x89);
    emit_byte(modrm);
    emit_long(d);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_l_mr, (IMM d, R4 s))

/* Emit 66 89 /r with an absolute disp32 operand: store register s as a word
 * at address d. */
LOWFUNC(NONE, WRITE, 2, raw_mov_w_mr, (IMM d, R2 s))
{
    const int modrm = 0x05 + 8 * s;     /* mod=00, reg=s, rm=101 (disp32) */

    emit_byte(0x66);                    /* operand-size prefix */
    emit_byte(0x89);
    emit_byte(modrm);
    emit_long(d);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_w_mr, (IMM d, R2 s))

/* Load 16-bit register d from the absolute address s
   (0x66 prefix + MOV r16, r/m16, disp32-only addressing). */
LOWFUNC(NONE, READ, 2, raw_mov_w_rm, (W2 d, IMM s))
{
    emit_byte(0x66);            /* 16-bit operand size */
    emit_byte(0x8b);
    emit_byte(8 * d + 0x05);    /* mod = 00, reg = d, rm = 101 */
    emit_long(s);
}
LENDFUNC(NONE, READ, 2, raw_mov_w_rm, (W2 d, IMM s))

/* Store 8-bit register s at the absolute address d
   (MOV r/m8, r8, disp32-only addressing). */
LOWFUNC(NONE, WRITE, 2, raw_mov_b_mr, (IMM d, R1 s))
{
    emit_byte(0x88);
    /* XXX masking with 0xf maps the upper-half register indices
       (e.g. %ah, defined as 0x10+4) onto their 3-bit encoding and
       leaves ordinary indices unchanged */
    emit_byte(8 * (s & 0xf) + 0x05);
    emit_long(d);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_b_mr, (IMM d, R1 s))

/* Load 8-bit register d from the absolute address s
   (MOV r8, r/m8, disp32-only addressing). */
LOWFUNC(NONE, READ, 2, raw_mov_b_rm, (W1 d, IMM s))
{
    emit_byte(0x8a);
    emit_byte(8 * d + 0x05);    /* mod = 00, reg = d, rm = 101 */
    emit_long(s);
}
LENDFUNC(NONE, READ, 2, raw_mov_b_rm, (W1 d, IMM s))

/* Load the 32-bit immediate s into register d (MOV r32, imm32: 0xB8+d). */
LOWFUNC(NONE, NONE, 2, raw_mov_l_ri, (W4 d, IMM s))
{
    emit_byte(d + 0xb8);
    emit_long(s);
}
LENDFUNC(NONE, NONE, 2, raw_mov_l_ri, (W4 d, IMM s))

/* Load the 16-bit immediate s into register d
   (0x66 prefix + MOV r16, imm16: 0xB8+d). */
LOWFUNC(NONE, NONE, 2, raw_mov_w_ri, (W2 d, IMM s))
{
    emit_byte(0x66);        /* 16-bit operand size */
    emit_byte(d + 0xb8);
    emit_word(s);
}
LENDFUNC(NONE, NONE, 2, raw_mov_w_ri, (W2 d, IMM s))

/* Load the 8-bit immediate s into register d (MOV r8, imm8: 0xB0+d). */
LOWFUNC(NONE, NONE, 2, raw_mov_b_ri, (W1 d, IMM s))
{
    emit_byte(d + 0xb0);
    emit_byte(s);
}
LENDFUNC(NONE, NONE, 2, raw_mov_b_ri, (W1 d, IMM s))

/* Add-with-carry the 32-bit immediate s to the long at absolute address d
   (0x81 /2 = ADC r/m32, imm32; ModRM 0x15 selects disp32-only addressing). */
LOWFUNC(RMW, RMW, 2, raw_adc_l_mi, (MEMRW d, IMM s))
{
    emit_byte(0x81);
    emit_byte(0x15);
    emit_long(d);
    emit_long(s);
}
LENDFUNC(RMW, RMW, 2, raw_adc_l_mi, (MEMRW d, IMM s))

/* Add the immediate s to the long at absolute address d
   (ADD r/m32, imm: /0 with disp32-only addressing, ModRM 0x05). */
LOWFUNC(WRITE, RMW, 2, raw_add_l_mi, (IMM d, IMM s))
{
    if (!optimize_imm8 || !isbyte(s))
    {
        /* 0x81: full 32-bit immediate */
        emit_byte(0x81);
        emit_byte(0x05);
        emit_long(d);
        emit_long(s);
    }
    else
    {
        /* 0x83: one-byte immediate */
        emit_byte(0x83);
        emit_byte(0x05);
        emit_long(d);
        emit_byte(s);
    }
}
LENDFUNC(WRITE, RMW, 2, raw_add_l_mi, (IMM d, IMM s))

/* Add the 16-bit immediate s to the word at absolute address d
   (0x66 prefix + 0x81 /0 = ADD r/m16, imm16; disp32-only addressing). */
LOWFUNC(WRITE, RMW, 2, raw_add_w_mi, (IMM d, IMM s))
{
    emit_byte(0x66);
    emit_byte(0x81);
    emit_byte(0x05);
    emit_long(d);
    emit_word(s);
}
LENDFUNC(WRITE, RMW, 2, raw_add_w_mi, (IMM d, IMM s))

/* Add the 8-bit immediate s to the byte at absolute address d
   (0x80 /0 = ADD r/m8, imm8; disp32-only addressing). */
LOWFUNC(WRITE, RMW, 2, raw_add_b_mi, (IMM d, IMM s))
{
    emit_byte(0x80);
    emit_byte(0x05);
    emit_long(d);
    emit_byte(s);
}
LENDFUNC(WRITE, RMW, 2, raw_add_b_mi, (IMM d, IMM s))

/* Set flags from (d AND i) without modifying d (TEST r32, imm32). */
LOWFUNC(WRITE, NONE, 2, raw_test_l_ri, (R4 d, IMM i))
{
    if (!optimize_accum || !isaccum(d))
    {
        /* general form: 0xF7 /0 with register-direct ModRM */
        emit_byte(0xf7);
        emit_byte(d + 0xc0);
    }
    else
    {
        /* one-byte accumulator form */
        emit_byte(0xa9);
    }
    emit_long(i);
}
LENDFUNC(WRITE, NONE, 2, raw_test_l_ri, (R4 d, IMM i))

/* Set flags from (d AND s), 32-bit registers (TEST r/m32, r32). */
LOWFUNC(WRITE, NONE, 2, raw_test_l_rr, (R4 d, R4 s))
{
    emit_byte(0x85);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_test_l_rr, (R4 d, R4 s))

/* Set flags from (d AND s), 16-bit registers (0x66 + TEST r/m16, r16). */
LOWFUNC(WRITE, NONE, 2, raw_test_w_rr, (R2 d, R2 s))
{
    emit_byte(0x66);                /* 16-bit operand size */
    emit_byte(0x85);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_test_w_rr, (R2 d, R2 s))

/* Set flags from (d AND s), 8-bit registers (TEST r/m8, r8). */
LOWFUNC(WRITE, NONE, 2, raw_test_b_rr, (R1 d, R1 s))
{
    emit_byte(0x84);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_test_b_rr, (R1 d, R1 s))

/* d ^= i for a 32-bit register (0x81 /6 = XOR r/m32, imm32).  Only the
   full imm32 form is emitted. */
LOWFUNC(WRITE, NONE, 2, raw_xor_l_ri, (RW4 d, IMM i))
{
    emit_byte(0x81);
    emit_byte(d + 0xf0);    /* mod = 11, /6, rm = d */
    emit_long(i);
}
LENDFUNC(WRITE, NONE, 2, raw_xor_l_ri, (RW4 d, IMM i))

/* d &= i for a 32-bit register.  Picks the shortest enabled encoding:
   0x83 /4 with a one-byte immediate, the accumulator short form 0x25,
   or the general 0x81 /4 with imm32. */
LOWFUNC(WRITE, NONE, 2, raw_and_l_ri, (RW4 d, IMM i))
{
    if (!optimize_imm8 || !isbyte(i))
    {
        if (!optimize_accum || !isaccum(d))
        {
            emit_byte(0x81);
            emit_byte(d + 0xe0);
        }
        else
            emit_byte(0x25);
        emit_long(i);
    }
    else
    {
        emit_byte(0x83);
        emit_byte(d + 0xe0);
        emit_byte(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_and_l_ri, (RW4 d, IMM i))

/* d &= i for a 16-bit register.  Same encoding choices as the 32-bit
   variant, preceded by the 0x66 operand-size prefix and with a 16-bit
   immediate in the long forms. */
LOWFUNC(WRITE, NONE, 2, raw_and_w_ri, (RW2 d, IMM i))
{
    emit_byte(0x66);    /* 16-bit operand size */
    if (!optimize_imm8 || !isbyte(i))
    {
        if (!optimize_accum || !isaccum(d))
        {
            emit_byte(0x81);
            emit_byte(d + 0xe0);
        }
        else
            emit_byte(0x25);
        emit_word(i);
    }
    else
    {
        emit_byte(0x83);
        emit_byte(d + 0xe0);
        emit_byte(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_and_w_ri, (RW2 d, IMM i))

/* d &= s, 32-bit registers (AND r/m32, r32). */
LOWFUNC(WRITE, NONE, 2, raw_and_l, (RW4 d, R4 s))
{
    emit_byte(0x21);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_and_l, (RW4 d, R4 s))

/* d &= s, 16-bit registers (0x66 + AND r/m16, r16). */
LOWFUNC(WRITE, NONE, 2, raw_and_w, (RW2 d, R2 s))
{
    emit_byte(0x66);                /* 16-bit operand size */
    emit_byte(0x21);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_and_w, (RW2 d, R2 s))

/* d &= s, 8-bit registers (AND r/m8, r8). */
LOWFUNC(WRITE, NONE, 2, raw_and_b, (RW1 d, R1 s))
{
    emit_byte(0x20);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_and_b, (RW1 d, R1 s))

/* d |= i for a 32-bit register.  Picks the shortest enabled encoding:
   0x83 /1 with a one-byte immediate, the accumulator short form 0x0D,
   or the general 0x81 /1 with imm32. */
LOWFUNC(WRITE, NONE, 2, raw_or_l_ri, (RW4 d, IMM i))
{
    if (!optimize_imm8 || !isbyte(i))
    {
        if (!optimize_accum || !isaccum(d))
        {
            emit_byte(0x81);
            emit_byte(d + 0xc8);
        }
        else
            emit_byte(0x0d);
        emit_long(i);
    }
    else
    {
        emit_byte(0x83);
        emit_byte(d + 0xc8);
        emit_byte(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_or_l_ri, (RW4 d, IMM i))

/* d |= s, 32-bit registers (OR r/m32, r32). */
LOWFUNC(WRITE, NONE, 2, raw_or_l, (RW4 d, R4 s))
{
    emit_byte(0x09);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_or_l, (RW4 d, R4 s))

/* d |= s, 16-bit registers (0x66 + OR r/m16, r16). */
LOWFUNC(WRITE, NONE, 2, raw_or_w, (RW2 d, R2 s))
{
    emit_byte(0x66);                /* 16-bit operand size */
    emit_byte(0x09);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_or_w, (RW2 d, R2 s))

/* d |= s, 8-bit registers (OR r/m8, r8). */
LOWFUNC(WRITE, NONE, 2, raw_or_b, (RW1 d, R1 s))
{
    emit_byte(0x08);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_or_b, (RW1 d, R1 s))

/* d += s + carry, 32-bit registers (ADC r/m32, r32). */
LOWFUNC(RMW, NONE, 2, raw_adc_l, (RW4 d, R4 s))
{
    emit_byte(0x11);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(RMW, NONE, 2, raw_adc_l, (RW4 d, R4 s))

/* d += s + carry, 16-bit registers (0x66 + ADC r/m16, r16). */
LOWFUNC(RMW, NONE, 2, raw_adc_w, (RW2 d, R2 s))
{
    emit_byte(0x66);                /* 16-bit operand size */
    emit_byte(0x11);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(RMW, NONE, 2, raw_adc_w, (RW2 d, R2 s))

/* d += s + carry, 8-bit registers (ADC r/m8, r8). */
LOWFUNC(RMW, NONE, 2, raw_adc_b, (RW1 d, R1 s))
{
    emit_byte(0x10);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(RMW, NONE, 2, raw_adc_b, (RW1 d, R1 s))

/* d += s, 32-bit registers (ADD r/m32, r32). */
LOWFUNC(WRITE, NONE, 2, raw_add_l, (RW4 d, R4 s))
{
    emit_byte(0x01);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_add_l, (RW4 d, R4 s))

/* d += s, 16-bit registers (0x66 + ADD r/m16, r16). */
LOWFUNC(WRITE, NONE, 2, raw_add_w, (RW2 d, R2 s))
{
    emit_byte(0x66);                /* 16-bit operand size */
    emit_byte(0x01);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_add_w, (RW2 d, R2 s))

/* d += s, 8-bit registers (ADD r/m8, r8). */
LOWFUNC(WRITE, NONE, 2, raw_add_b, (RW1 d, R1 s))
{
    emit_byte(0x00);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_add_b, (RW1 d, R1 s))

/* d -= i for a 32-bit register: 0x83 /5 with a one-byte immediate, the
   accumulator short form 0x2D, or the general 0x81 /5 with imm32.
   NOTE(review): the one-byte form is chosen on isbyte(i) alone, without
   the optimize_imm8 check used by most sibling encoders -- confirm. */
LOWFUNC(WRITE, NONE, 2, raw_sub_l_ri, (RW4 d, IMM i))
{
    if (!isbyte(i))
    {
        if (!optimize_accum || !isaccum(d))
        {
            emit_byte(0x81);
            emit_byte(d + 0xe8);
        }
        else
            emit_byte(0x2d);
        emit_long(i);
    }
    else
    {
        emit_byte(0x83);
        emit_byte(d + 0xe8);
        emit_byte(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_sub_l_ri, (RW4 d, IMM i))

/* d -= i for an 8-bit register (SUB r/m8, imm8). */
LOWFUNC(WRITE, NONE, 2, raw_sub_b_ri, (RW1 d, IMM i))
{
    if (!optimize_accum || !isaccum(d))
    {
        /* general form: 0x80 /5 with register-direct ModRM */
        emit_byte(0x80);
        emit_byte(d + 0xe8);
    }
    else
    {
        /* one-byte accumulator form */
        emit_byte(0x2c);
    }
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_b_ri, (RW1 d, IMM i))

/* d += i for a 32-bit register: 0x83 /0 with a one-byte immediate, the
   accumulator short form 0x05, or the general 0x81 /0 with imm32.
   NOTE(review): the one-byte form is chosen on isbyte(i) alone, without
   the optimize_imm8 check used by most sibling encoders -- confirm. */
LOWFUNC(WRITE, NONE, 2, raw_add_l_ri, (RW4 d, IMM i))
{
    if (!isbyte(i))
    {
        if (!optimize_accum || !isaccum(d))
        {
            emit_byte(0x81);
            emit_byte(d + 0xc0);
        }
        else
            emit_byte(0x05);
        emit_long(i);
    }
    else
    {
        emit_byte(0x83);
        emit_byte(d + 0xc0);
        emit_byte(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_add_l_ri, (RW4 d, IMM i))

/* d += i for a 16-bit register: 0x66 prefix, then the same encoding
   choices as the 32-bit variant with a 16-bit immediate in the long forms.
   NOTE(review): the one-byte form is chosen on isbyte(i) alone, without
   the optimize_imm8 check used by most sibling encoders -- confirm. */
LOWFUNC(WRITE, NONE, 2, raw_add_w_ri, (RW2 d, IMM i))
{
    emit_byte(0x66);    /* 16-bit operand size */
    if (!isbyte(i))
    {
        if (!optimize_accum || !isaccum(d))
        {
            emit_byte(0x81);
            emit_byte(d + 0xc0);
        }
        else
            emit_byte(0x05);
        emit_word(i);
    }
    else
    {
        emit_byte(0x83);
        emit_byte(d + 0xc0);
        emit_byte(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_add_w_ri, (RW2 d, IMM i))

/* d += i for an 8-bit register (ADD r/m8, imm8). */
LOWFUNC(WRITE, NONE, 2, raw_add_b_ri, (RW1 d, IMM i))
{
    if (!optimize_accum || !isaccum(d))
    {
        /* general form: 0x80 /0 with register-direct ModRM */
        emit_byte(0x80);
        emit_byte(d + 0xc0);
    }
    else
    {
        /* one-byte accumulator form */
        emit_byte(0x04);
    }
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_add_b_ri, (RW1 d, IMM i))

/* d -= s + borrow, 32-bit registers (SBB r/m32, r32). */
LOWFUNC(RMW, NONE, 2, raw_sbb_l, (RW4 d, R4 s))
{
    emit_byte(0x19);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(RMW, NONE, 2, raw_sbb_l, (RW4 d, R4 s))

/* d -= s + borrow, 16-bit registers (0x66 + SBB r/m16, r16). */
LOWFUNC(RMW, NONE, 2, raw_sbb_w, (RW2 d, R2 s))
{
    emit_byte(0x66);                /* 16-bit operand size */
    emit_byte(0x19);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(RMW, NONE, 2, raw_sbb_w, (RW2 d, R2 s))

/* d -= s + borrow, 8-bit registers (SBB r/m8, r8). */
LOWFUNC(RMW, NONE, 2, raw_sbb_b, (RW1 d, R1 s))
{
    emit_byte(0x18);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(RMW, NONE, 2, raw_sbb_b, (RW1 d, R1 s))

/* d -= s, 32-bit registers (SUB r/m32, r32). */
LOWFUNC(WRITE, NONE, 2, raw_sub_l, (RW4 d, R4 s))
{
    emit_byte(0x29);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_sub_l, (RW4 d, R4 s))

/* d -= s, 16-bit registers (0x66 + SUB r/m16, r16). */
LOWFUNC(WRITE, NONE, 2, raw_sub_w, (RW2 d, R2 s))
{
    emit_byte(0x66);                /* 16-bit operand size */
    emit_byte(0x29);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_sub_w, (RW2 d, R2 s))

/* d -= s, 8-bit registers (SUB r/m8, r8). */
LOWFUNC(WRITE, NONE, 2, raw_sub_b, (RW1 d, R1 s))
{
    emit_byte(0x28);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_sub_b, (RW1 d, R1 s))

/* Set flags from d - s, 32-bit registers (CMP r/m32, r32). */
LOWFUNC(WRITE, NONE, 2, raw_cmp_l, (R4 d, R4 s))
{
    emit_byte(0x39);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_l, (R4 d, R4 s))

/* Set flags from r - i for a 32-bit register.  Picks the shortest enabled
   encoding: 0x83 /7 with a one-byte immediate, the accumulator short form
   0x3D, or the general 0x81 /7 with imm32. */
LOWFUNC(WRITE, NONE, 2, raw_cmp_l_ri, (R4 r, IMM i))
{
    if (!optimize_imm8 || !isbyte(i))
    {
        if (!optimize_accum || !isaccum(r))
        {
            emit_byte(0x81);
            emit_byte(r + 0xf8);
        }
        else
            emit_byte(0x3d);
        emit_long(i);
    }
    else
    {
        emit_byte(0x83);
        emit_byte(r + 0xf8);
        emit_byte(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_l_ri, (R4 r, IMM i))

/* Set flags from d - s, 16-bit registers (0x66 + CMP r/m16, r16). */
LOWFUNC(WRITE, NONE, 2, raw_cmp_w, (R2 d, R2 s))
{
    emit_byte(0x66);                /* 16-bit operand size */
    emit_byte(0x39);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_w, (R2 d, R2 s))

/* Set flags from (byte at absolute address d) - s
   (0x80 /7 = CMP r/m8, imm8; ModRM 0x3D selects disp32-only addressing). */
LOWFUNC(WRITE, READ, 2, raw_cmp_b_mi, (MEMR d, IMM s))
{
    emit_byte(0x80);
    emit_byte(0x3d);
    emit_long(d);
    emit_byte(s);
}
/* The closing marker must name this function; it previously said
   raw_cmp_l_mi (copy-paste slip). */
LENDFUNC(WRITE, READ, 2, raw_cmp_b_mi, (MEMR d, IMM s))

/* Set flags from d - i for an 8-bit register (CMP r/m8, imm8). */
LOWFUNC(WRITE, NONE, 2, raw_cmp_b_ri, (R1 d, IMM i))
{
    if (!optimize_accum || !isaccum(d))
    {
        /* general form: 0x80 /7 with register-direct ModRM */
        emit_byte(0x80);
        emit_byte(d + 0xf8);
    }
    else
    {
        /* one-byte accumulator form */
        emit_byte(0x3c);
    }
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_b_ri, (R1 d, IMM i))

/* Set flags from d - s, 8-bit registers (CMP r/m8, r8). */
LOWFUNC(WRITE, NONE, 2, raw_cmp_b, (R1 d, R1 s))
{
    emit_byte(0x38);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_b, (R1 d, R1 s))

/* Compare the long at [offset + index * factor] with register d
   (opcode 0x39 with a SIB byte and no base register).
   factor must be 1, 2, 4 or 8; any other value aborts. */
LOWFUNC(WRITE, READ, 4, raw_cmp_l_rm_indexed, (R4 d, IMM offset, R4 index, IMM factor))
{
    int scale_bits;     /* log2(factor), goes into the SIB scale field */

    if (factor == 1)
        scale_bits = 0;
    else if (factor == 2)
        scale_bits = 1;
    else if (factor == 4)
        scale_bits = 2;
    else if (factor == 8)
        scale_bits = 3;
    else
        abort();

    emit_byte(0x39);
    emit_byte(8 * d + 0x04);                            /* mod = 00, reg = d, rm = 100 (SIB) */
    emit_byte(0x40 * scale_bits + 8 * index + 5);       /* base = 101: disp32, no base */
    emit_long(offset);
}
LENDFUNC(WRITE, READ, 4, raw_cmp_l_rm_indexed, (R4 d, IMM offset, R4 index, IMM factor))

/* d ^= s, 32-bit registers (XOR r/m32, r32). */
LOWFUNC(WRITE, NONE, 2, raw_xor_l, (RW4 d, R4 s))
{
    emit_byte(0x31);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_xor_l, (RW4 d, R4 s))

/* d ^= s, 16-bit registers (0x66 + XOR r/m16, r16). */
LOWFUNC(WRITE, NONE, 2, raw_xor_w, (RW2 d, R2 s))
{
    emit_byte(0x66);                /* 16-bit operand size */
    emit_byte(0x31);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_xor_w, (RW2 d, R2 s))

/* d ^= s, 8-bit registers (XOR r/m8, r8). */
LOWFUNC(WRITE, NONE, 2, raw_xor_b, (RW1 d, R1 s))
{
    emit_byte(0x30);
    emit_byte(d + 8 * s + 0xc0);    /* mod = 11, reg = s, rm = d */
}
LENDFUNC(WRITE, NONE, 2, raw_xor_b, (RW1 d, R1 s))

/* Subtract the immediate s from the long at absolute address d
   (SUB r/m32, imm: /5 with disp32-only addressing, ModRM 0x2D). */
LOWFUNC(WRITE, RMW, 2, raw_sub_l_mi, (MEMRW d, IMM s))
{
    if (!optimize_imm8 || !isbyte(s))
    {
        /* 0x81: full 32-bit immediate */
        emit_byte(0x81);
        emit_byte(0x2d);
        emit_long(d);
        emit_long(s);
    }
    else
    {
        /* 0x83: one-byte immediate */
        emit_byte(0x83);
        emit_byte(0x2d);
        emit_long(d);
        emit_byte(s);
    }
}
LENDFUNC(WRITE, RMW, 2, raw_sub_l_mi, (MEMRW d, IMM s))

/* Set flags from (long at absolute address d) - s
   (CMP r/m32, imm: /7 with disp32-only addressing, ModRM 0x3D). */
LOWFUNC(WRITE, READ, 2, raw_cmp_l_mi, (MEMR d, IMM s))
{
    if (!optimize_imm8 || !isbyte(s))
    {
        /* 0x81: full 32-bit immediate */
        emit_byte(0x81);
        emit_byte(0x3d);
        emit_long(d);
        emit_long(s);
    }
    else
    {
        /* 0x83: one-byte immediate */
        emit_byte(0x83);
        emit_byte(0x3d);
        emit_long(d);
        emit_byte(s);
    }
}
LENDFUNC(WRITE, READ, 2, raw_cmp_l_mi, (MEMR d, IMM s))

/* Exchange the contents of 32-bit registers r1 and r2 (XCHG r/m32, r32). */
LOWFUNC(NONE, NONE, 2, raw_xchg_l_rr, (RW4 r1, RW4 r2))
{
    emit_byte(0x87);
    emit_byte(r2 + 8 * r1 + 0xc0);  /* mod = 11, reg = r1, rm = r2 */
}
LENDFUNC(NONE, NONE, 2, raw_xchg_l_rr, (RW4 r1, RW4 r2))

/* Exchange the contents of 8-bit registers r1 and r2 (XCHG r/m8, r8). */
LOWFUNC(NONE, NONE, 2, raw_xchg_b_rr, (RW4 r1, RW4 r2))
{
    emit_byte(0x86);
    emit_byte(0xc0 + 8 * (r1 & 0xf) + (r2 & 0xf)); /* XXX this handles upper-halves registers (e.g. %ah defined as 0x10+4) */
}
/* The closing marker must name this function; it previously said
   raw_xchg_l_rr (copy-paste slip). */
LENDFUNC(NONE, NONE, 2, raw_xchg_b_rr, (RW4 r1, RW4 r2))

/*************************************************************************
* FIXME: mem access modes probably wrong                                *
*************************************************************************/

/* Push the flags register onto the stack (PUSHF, opcode 0x9C). */
LOWFUNC(READ, WRITE, 0, raw_pushfl, (void))
{
    emit_byte(0x9c);
}
LENDFUNC(READ, WRITE, 0, raw_pushfl, (void))

/* Pop the top of the stack into the flags register (POPF, opcode 0x9D). */
LOWFUNC(WRITE, READ, 0, raw_popfl, (void))
{
    emit_byte(0x9d);
}
LENDFUNC(WRITE, READ, 0, raw_popfl, (void))

/* Generate floating-point instructions */

/* Emit an x87 add of the 64-bit float at absolute address s to the top
   of the FPU stack (0xDC /0 = FADD m64fp; ModRM 0x05 = disp32 only). */
static inline void x86_fadd_m(MEMR s)
{
    emit_byte(0xdc);
    emit_byte(0x05);
    emit_long(s);
}

#endif

/*************************************************************************
* Unoptimizable stuff --- jump                                          *
*************************************************************************/

/* Emit an indirect call through register r (CALL r/m32: 0xFF /2). */
static __inline__ void raw_call_r(R4 r)
{
    #if USE_NEW_RTASM
    CALLsr(r);
    #else
    emit_byte(0xff);
    emit_byte(r + 0xd0);    /* mod = 11, /2, rm = r */
    #endif
}

/* Emit an indirect call through the pointer stored at [base + r * m].
   In the hand-encoded path m must be 1, 2, 4 or 8; anything else aborts. */
static __inline__ void raw_call_m_indexed(uae_u32 base, uae_u32 r, uae_u32 m)
{
    #if USE_NEW_RTASM
    CALLsm(base, X86_NOREG, r, m);
    #else
    int scale_bits;     /* log2(m), goes into the SIB scale field */

    if (m == 1)
        scale_bits = 0;
    else if (m == 2)
        scale_bits = 1;
    else if (m == 4)
        scale_bits = 2;
    else if (m == 8)
        scale_bits = 3;
    else
        abort();

    emit_byte(0xff);
    emit_byte(0x14);                                /* mod = 00, /2 (CALL), rm = 100 (SIB) */
    emit_byte(0x40 * scale_bits + 8 * r + 0x05);    /* base = 101: disp32, no base */
    emit_long(base);
    #endif
}

/* Emit an indirect jump through register r (JMP r/m32: 0xFF /4). */
static __inline__ void raw_jmp_r(R4 r)
{
    #if USE_NEW_RTASM
    JMPsr(r);
    #else
    emit_byte(0xff);
    emit_byte(r + 0xe0);    /* mod = 11, /4, rm = r */
    #endif
}

/* Emit an indirect jump through the pointer stored at [base + r * m].
   In the hand-encoded path m must be 1, 2, 4 or 8; anything else aborts. */
static __inline__ void raw_jmp_m_indexed(uae_u32 base, uae_u32 r, uae_u32 m)
{
    #if USE_NEW_RTASM
    JMPsm(base, X86_NOREG, r, m);
    #else
    int scale_bits;     /* log2(m), goes into the SIB scale field */

    if (m == 1)
        scale_bits = 0;
    else if (m == 2)
        scale_bits = 1;
    else if (m == 4)
        scale_bits = 2;
    else if (m == 8)
        scale_bits = 3;
    else
        abort();

    emit_byte(0xff);
    emit_byte(0x24);                                /* mod = 00, /4 (JMP), rm = 100 (SIB) */
    emit_byte(0x40 * scale_bits + 8 * r + 0x05);    /* base = 101: disp32, no base */
    emit_long(base);
    #endif
}

/* Emit an indirect jump through the pointer stored at absolute address
   base (0xFF /4 with ModRM 0x25: disp32-only addressing). */
static __inline__ void raw_jmp_m(uae_u32 base)
{
    emit_byte(0xff);
    emit_byte(0x25);
    emit_long(base);
}


/* Emit a direct call to address t (CALL rel32, opcode 0xE8).
   The displacement is t minus the address just past the 4-byte operand;
   `target` is read after the opcode byte has been emitted. */
static __inline__ void raw_call(uae_u32 t)
{
    #if USE_NEW_RTASM
    CALLm(t);
    #else
    emit_byte(0xe8);
    /* Cast the emit pointer via uintptr, as raw_jl/raw_jz/raw_jnz do:
       a direct pointer-to-uae_u32 cast is rejected or truncating where
       pointers are wider than 32 bits.  The low 32 bits are unchanged. */
    emit_long(t - (uintptr)target - 4);
    #endif
}

/* Emit a direct jump to address t (JMP rel32, opcode 0xE9).
   The displacement is t minus the address just past the 4-byte operand;
   `target` is read after the opcode byte has been emitted. */
static __inline__ void raw_jmp(uae_u32 t)
{
    #if USE_NEW_RTASM
    JMPm(t);
    #else
    emit_byte(0xe9);
    /* Cast the emit pointer via uintptr, as raw_jl/raw_jz/raw_jnz do:
       a direct pointer-to-uae_u32 cast is rejected or truncating where
       pointers are wider than 32 bits.  The low 32 bits are unchanged. */
    emit_long(t - (uintptr)target - 4);
    #endif
}

/* Emit a near conditional jump to t, taken on "less" (0x0F 0x8C rel32).
   The displacement is relative to the end of the 4-byte operand. */
static __inline__ void raw_jl(uae_u32 t)
{
    emit_byte(0x0f);
    emit_byte(0x8c);
    emit_long(t - (uintptr)target - 4);
}

/* Emit a near conditional jump to t, taken when ZF is set
   (0x0F 0x84 rel32).  The displacement is relative to the end of the
   4-byte operand. */
static __inline__ void raw_jz(uae_u32 t)
{
    emit_byte(0x0f);
    emit_byte(0x84);
    emit_long(t - (uintptr)target - 4);
}

/* Emit a near conditional jump to t, taken when ZF is clear
   (0x0F 0x85 rel32).  The displacement is relative to the end of the
   4-byte operand. */
static __inline__ void raw_jnz(uae_u32 t)
{
    emit_byte(0x0f);
    emit_byte(0x85);
    emit_long(t - (uintptr)target - 4);
}

/* Emit only the two opcode bytes of a near JNZ (0x0F 0x85); no
   displacement is emitted here. */
static __inline__ void raw_jnz_l_oponly(void)
{
    emit_byte(0x0f);
    emit_byte(0x85);
}

/* Emit only the two opcode bytes of a near Jcc for condition code cc
   (0x0F 0x80+cc); no displacement is emitted here. */
static __inline__ void raw_jcc_l_oponly(int cc)
{
    emit_byte(0x0f);
    emit_byte(0x80 + cc);
}

/* Emit only the opcode byte of a short JNZ (0x75); no displacement is
   emitted here. */
static __inline__ void raw_jnz_b_oponly(void)
{
    emit_byte(0x75);
}

/* Emit only the opcode byte of a short JZ (0x74); no displacement is
   emitted here. */
static __inline__ void raw_jz_b_oponly(void)
{
    emit_byte(0x74);
}

/* Emit only the opcode byte of a short Jcc for condition code cc
   (0x70+cc); no displacement is emitted here. */
static __inline__ void raw_jcc_b_oponly(int cc)
{
    emit_byte(0x70 + cc);
}

/* Emit only the opcode byte of a near JMP (0xE9); no displacement is
   emitted here. */
static __inline__ void raw_jmp_l_oponly(void)
{
    emit_byte(0xe9);
}

/* Emit only the opcode byte of a short JMP (0xEB); no displacement is
   emitted here. */
static __inline__ void raw_jmp_b_oponly(void)
{
    emit_byte(0xeb);
}

/* Emit a near return (RET, opcode 0xC3). */
static __inline__ void raw_ret(void)
{
    emit_byte(0xc3);
}

/* Emit a single-byte no-op (NOP, opcode 0x90). */
static __inline__ void raw_nop(void)
{
    emit_byte(0x90);
}

/* Emit exactly nbytes bytes of padding that execute as no-ops.
   A count of zero or less emits nothing.
   - x86-64: a balanced run of NOPs, each preceded by 0x66 prefixes.
   - IA-32:  whole 16-byte filler blocks, then one pattern from the table
             below for the remaining 1..15 bytes. */
static __inline__ void raw_emit_nop_filler(int nbytes)
{
    /* Source: GNU Binutils 2.12.90.0.15 */
    /* Various efficient no-op patterns for aligning code labels.
       Note: Don't try to assemble the instructions in the comments.
       0L and 0w are not legal.  */
    static const uae_u8 f32_1[] =
    { 0x90 };                                       /* nop                  */
    static const uae_u8 f32_2[] =
    { 0x89, 0xf6 };                                 /* movl %esi,%esi       */
    static const uae_u8 f32_3[] =
    { 0x8d, 0x76, 0x00 };                           /* leal 0(%esi),%esi    */
    static const uae_u8 f32_4[] =
    { 0x8d, 0x74, 0x26, 0x00 };                     /* leal 0(%esi,1),%esi  */
    static const uae_u8 f32_5[] =
    { 0x90,                                         /* nop                  */
      0x8d, 0x74, 0x26, 0x00 };                     /* leal 0(%esi,1),%esi  */
    static const uae_u8 f32_6[] =
    { 0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 };         /* leal 0L(%esi),%esi   */
    static const uae_u8 f32_7[] =
    { 0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00 };   /* leal 0L(%esi,1),%esi */
    static const uae_u8 f32_8[] =
    { 0x90,                                         /* nop                  */
      0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00 };   /* leal 0L(%esi,1),%esi */
    static const uae_u8 f32_9[] =
    { 0x89, 0xf6,                                   /* movl %esi,%esi       */
      0x8d, 0xbc, 0x27, 0x00, 0x00, 0x00, 0x00 };   /* leal 0L(%edi,1),%edi */
    static const uae_u8 f32_10[] =
    { 0x8d, 0x76, 0x00,                             /* leal 0(%esi),%esi    */
      0x8d, 0xbc, 0x27, 0x00, 0x00, 0x00, 0x00 };   /* leal 0L(%edi,1),%edi */
    static const uae_u8 f32_11[] =
    { 0x8d, 0x74, 0x26, 0x00,                       /* leal 0(%esi,1),%esi  */
      0x8d, 0xbc, 0x27, 0x00, 0x00, 0x00, 0x00 };   /* leal 0L(%edi,1),%edi */
    static const uae_u8 f32_12[] =
    { 0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00,           /* leal 0L(%esi),%esi   */
      0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00 };         /* leal 0L(%edi),%edi   */
    static const uae_u8 f32_13[] =
    { 0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00,           /* leal 0L(%esi),%esi   */
      0x8d, 0xbc, 0x27, 0x00, 0x00, 0x00, 0x00 };   /* leal 0L(%edi,1),%edi */
    static const uae_u8 f32_14[] =
    { 0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00,     /* leal 0L(%esi,1),%esi */
      0x8d, 0xbc, 0x27, 0x00, 0x00, 0x00, 0x00 };   /* leal 0L(%edi,1),%edi */
    static const uae_u8 f32_15[] =
    { 0xeb, 0x0d, 0x90, 0x90, 0x90, 0x90, 0x90,     /* jmp .+15; lotsa nops */
      0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90 };
    /* 16-byte variant: the same jmp .+15, so it lands on the final nop */
    static const uae_u8 f32_16[] =
    { 0xeb, 0x0d, 0x90, 0x90, 0x90, 0x90, 0x90,     /* jmp .+15; lotsa nops */
      0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90 };
    /* Indexed by (length - 1) for lengths 1..15 */
    static const uae_u8* const f32_patt[] = {
        f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
        f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
    };
    static const uae_u8 prefixes[4] = { 0x66, 0x66, 0x66, 0x66 };

    /* Nothing to pad.  Negative counts are rejected as well: they would
       divide by zero in the x86-64 path (nnops == 0) and index the
       pattern table with a negative value in the IA-32 path. */
    if (nbytes <= 0)
        return;

    #if defined (__x86_64__)
    /* The recommended way to pad 64bit code is to use NOPs preceded by
       maximally four 0x66 prefixes.  Balance the size of nops.  */
    int i;
    int nnops = (nbytes + 3) / 4;           /* number of NOP instructions */
    int len = nbytes / nnops;               /* base size of each NOP in bytes */
    int remains = nbytes - nnops * len;     /* NOPs that need one extra byte */

    /* The first `remains` NOPs are len + 1 bytes long ... */
    for (i = 0; i < remains; i++)
    {
        emit_block(prefixes, len);
        raw_nop();
    }
    /* ... and the rest are len bytes long. */
    for (; i < nnops; i++)
    {
        emit_block(prefixes, len - 1);
        raw_nop();
    }
    #else
    int nloops = nbytes / 16;
    while (nloops-- > 0)
        emit_block(f32_16, sizeof(f32_16));

    nbytes %= 16;
    if (nbytes)
        emit_block(f32_patt[nbytes - 1], nbytes);
    #endif
}


/*************************************************************************
* Flag handling, to and fro UAE flag register                           *
*************************************************************************/

/* Book-keeping after FLAGTMP has been written to memory from native
   register r: mark FLAGTMP as memory-resident with no register assigned
   and release r.  Aborts unless r held exactly one value. */
static __inline__ void raw_flags_evicted(int r)
{
    /* We just "evicted" FLAGTMP (INMEM rather than CLEAN). */
    live.state[FLAGTMP].status = INMEM;
    live.state[FLAGTMP].realreg = -1;

    /* Huh?  r is expected to hold FLAGTMP and nothing else. */
    if (live.nat[r].nholds != 1)
        abort();

    live.nat[r].nholds = 0;
}

#define FLAG_NREG1_FLAGREG 0  /* Set to -1 if any register will do */
/* LAHF/SETO variant: save the native flags into FLAGTMP's memory slot.
   The overflow flag is stored as a byte at offset 0 (setcc with condition
   code 0 straight to memory) and AH, loaded by LAHF, at offset 1.
   r is then recorded as no longer holding FLAGTMP. */
static __inline__ void raw_flags_to_reg_FLAGREG(int r)
{
    raw_lahf(0); /* Most flags in AH */
    //raw_setcc(r,0); /* V flag in AL */
    raw_setcc_m((uintptr)live.state[FLAGTMP].mem, 0);

    #if 1 /* Let's avoid those nasty partial register stalls */
    //raw_mov_b_mr((uintptr)live.state[FLAGTMP].mem,r);
    raw_mov_b_mr(((uintptr)live.state[FLAGTMP].mem) + 1, AH_INDEX);
    raw_flags_evicted(r);
    #endif
}

#define FLAG_NREG2_FLAGREG 0  /* Set to -1 if any register will do */
/* LAHF/SETO variant: restore the native flags from register r.
   The byte compare against -127 regenerates the overflow flag from the
   low byte; SAHF then reloads the remaining flags from AH. */
static __inline__ void raw_reg_to_flags_FLAGREG(int r)
{
    raw_cmp_b_ri(r, -127); /* set V */
    raw_sahf(0);
}

#define FLAG_NREG3_FLAGREG 0  /* Set to -1 if any register will do */
/* LAHF/SAHF variant: replace the Z flag with the inverse of bit 14 of the
   incoming value of s; the other flags pass through LAHF/SAHF.
   tmp is a scratch register and is clobbered. */
static __inline__ void raw_flags_set_zero_FLAGREG(int s, int tmp)
{
    raw_mov_l_rr(tmp, s);               /* keep the incoming value of s */
    raw_lahf(s); /* flags into ah */
    raw_and_l_ri(s, 0xffffbfff);        /* clear bit 14 (Z in the LAHF layout) */
    raw_and_l_ri(tmp, 0x00004000);      /* isolate bit 14 of the saved value */
    raw_xor_l_ri(tmp, 0x00004000);      /* ... and invert it */
    raw_or_l(s, tmp);                   /* merge it in as the new Z bit */
    raw_sahf(s);
}

/* The LAHF/SETO variant needs no run-time initialisation. */
static __inline__ void raw_flags_init_FLAGREG(void) {
}

#define FLAG_NREG1_FLAGSTK -1  /* Set to -1 if any register will do */
/* PUSHF/POPF variant: save the native flags into FLAGTMP's memory slot by
   pushing them, popping into r and storing r as a long.  r is then
   recorded as no longer holding FLAGTMP. */
static __inline__ void raw_flags_to_reg_FLAGSTK(int r)
{
    raw_pushfl();
    raw_pop_l_r(r);
    raw_mov_l_mr((uintptr)live.state[FLAGTMP].mem, r);
    raw_flags_evicted(r);
}

#define FLAG_NREG2_FLAGSTK -1  /* Set to -1 if any register will do */
/* PUSHF/POPF variant: restore the native flags from register r by pushing
   it and popping it into the flags register. */
static __inline__ void raw_reg_to_flags_FLAGSTK(int r)
{
    raw_push_l_r(r);
    raw_popfl();
}

#define FLAG_NREG3_FLAGSTK -1  /* Set to -1 if any register will do */
/* PUSHF/POPF variant: replace the Z flag with the inverse of bit 6 of the
   incoming value of s; the other flags are written back as read.
   tmp is a scratch register and is clobbered. */
static __inline__ void raw_flags_set_zero_FLAGSTK(int s, int tmp)
{
    raw_mov_l_rr(tmp, s);               /* keep the incoming value of s */
    raw_pushfl();
    raw_pop_l_r(s);                     /* current flags into s */
    raw_and_l_ri(s, 0xffffffbf);        /* clear bit 6 (Z in the EFLAGS layout) */
    raw_and_l_ri(tmp, 0x00000040);      /* isolate bit 6 of the saved value */
    raw_xor_l_ri(tmp, 0x00000040);      /* ... and invert it */
    raw_or_l(s, tmp);                   /* merge it in as the new Z bit */
    raw_push_l_r(s);
    raw_popfl();
}

/* The PUSHF/POPF variant needs no run-time initialisation. */
static __inline__ void raw_flags_init_FLAGSTK(void) {
}

#if defined (__x86_64__)
/* Try to use the LAHF/SETO method on x86_64 since it is faster.
   This can't be the default because some older CPUs don't support
   LAHF/SAHF in long mode.  */
static int FLAG_NREG1_FLAGGEN = 0;
/* Run-time selected variant: save the native flags into FLAGTMP's memory
   slot.  Without LAHF support this defers to the PUSHF/POPF variant;
   with it, the flags are stored in the normal EFLAGS layout. */
static __inline__ void raw_flags_to_reg_FLAGGEN(int r)
{
    if (!have_lahf_lm)
    {
        raw_flags_to_reg_FLAGSTK(r);
        return;
    }

    // NOTE: the interpreter uses the normal EFLAGS layout
    //   pushf/popf CF(0) ZF( 6) SF( 7) OF(11)
    //   sahf/lahf  CF(8) ZF(14) SF(15) OF( 0)
    assert(r == 0);
    raw_setcc(r, 0);                    /* V flag in AL */
    raw_lea_l_r_scaled(0, 0, 8);        /* move it to its EFLAGS location */
    raw_mov_b_mr(((uintptr)live.state[FLAGTMP].mem) + 1, 0);    /* second byte: OF */
    raw_lahf(0);                        /* most flags in AH */
    raw_mov_b_mr((uintptr)live.state[FLAGTMP].mem, AH_INDEX);   /* first byte: AH */
    raw_flags_evicted(r);
}

static int FLAG_NREG2_FLAGGEN = 0;
/* Run-time selected variant: restore the native flags from register r.
   Without LAHF support this defers to the PUSHF/POPF variant. */
static __inline__ void raw_reg_to_flags_FLAGGEN(int r)
{
    if (!have_lahf_lm)
    {
        raw_reg_to_flags_FLAGSTK(r);
        return;
    }

    raw_xchg_b_rr(0, AH_INDEX);     /* swap the two low bytes of register 0 */
    raw_cmp_b_ri(r, -120);          /* set V */
    raw_sahf(0);
}

static int FLAG_NREG3_FLAGGEN = 0;
/* Run-time selected variant of raw_flags_set_zero: uses the PUSHF/POPF
   implementation unless LAHF is available. */
static __inline__ void raw_flags_set_zero_FLAGGEN(int s, int tmp)
{
    if (!have_lahf_lm)
    {
        raw_flags_set_zero_FLAGSTK(s, tmp);
        return;
    }
    raw_flags_set_zero_FLAGREG(s, tmp);
}

static __inline__ void raw_flags_init_FLAGGEN(void)
{
    if (have_lahf_lm)
    {
        FLAG_NREG1_FLAGGEN = FLAG_NREG1_FLAGREG;
        FLAG_NREG2_FLAGGEN = FLAG_NREG2_FLAGREG;
        FLAG_NREG1_FLAGGEN = FLAG_NREG3_FLAGREG;
    }
    else
    {
        FLAG_NREG1_FLAGGEN = FLAG_NREG1_FLAGSTK;
        FLAG_NREG2_FLAGGEN = FLAG_NREG2_FLAGSTK;
        FLAG_NREG1_FLAGGEN = FLAG_NREG3_FLAGSTK;
    }
}
#endif

/* Pick the flag save/restore implementation at compile time:
   FLAGREG when SAHF_SETO_PROFITABLE is defined, otherwise FLAGGEN on
   x86-64 and FLAGSTK elsewhere. */
#ifdef SAHF_SETO_PROFITABLE
    #define FLAG_SUFFIX FLAGREG
#elif defined __x86_64__
    #define FLAG_SUFFIX FLAGGEN
#else
    #define FLAG_SUFFIX FLAGSTK
#endif

/* Two-level paste so FLAG_SUFFIX is expanded before it is appended:
   FLAG_GLUE(x) yields x_<suffix>. */
#define FLAG_GLUE_2(x, y)       x ## _ ## y
#define FLAG_GLUE_1(x, y)       FLAG_GLUE_2(x, y)
#define FLAG_GLUE(x)            FLAG_GLUE_1(x, FLAG_SUFFIX)

/* Generic names mapped onto the selected implementation */
#define raw_flags_init FLAG_GLUE(raw_flags_init)
#define FLAG_NREG1 FLAG_GLUE(FLAG_NREG1)
#define raw_flags_to_reg FLAG_GLUE(raw_flags_to_reg)
#define FLAG_NREG2 FLAG_GLUE(FLAG_NREG2)
#define raw_reg_to_flags FLAG_GLUE(raw_reg_to_flags)
#define FLAG_NREG3 FLAG_GLUE(FLAG_NREG3)
#define raw_flags_set_zero FLAG_GLUE(raw_flags_set_zero)

/* Load the saved flag value of virtual register r from its memory slot
   into native register target, as one 32-bit read.
   Apparently, there are enough instructions between flag store and
   flag reload to avoid the partial memory stall; the disabled branch
   below reads the two bytes separately instead. */
static __inline__ void raw_load_flagreg(uae_u32 target, uae_u32 r)
{
    #if 1
    raw_mov_l_rm(target, (uintptr)live.state[r].mem);
    #else
    raw_mov_b_rm(target, (uintptr)live.state[r].mem);
    raw_mov_b_rm(target + 4, ((uintptr)live.state[r].mem) + 1);
    #endif
}

/* FLAGX is byte sized, and we *do* write it at that size.
   Load it from the memory slot of virtual register r into native register
   target, using the narrowest access the target register allows: byte,
   else word, else long. */
static __inline__ void raw_load_flagx(uae_u32 target, uae_u32 r)
{
    if (live.nat[target].canbyte)
    {
        raw_mov_b_rm(target, (uintptr)live.state[r].mem);
        return;
    }
    if (live.nat[target].canword)
    {
        raw_mov_w_rm(target, (uintptr)live.state[r].mem);
        return;
    }
    raw_mov_l_rm(target, (uintptr)live.state[r].mem);
}

/* Emit code that subtracts off from the native stack pointer; emits
   nothing when off is zero. */
static __inline__ void raw_dec_sp(int off)
{
    if (off == 0)
        return;
    raw_sub_l_ri(ESP_INDEX, off);
}

/* Emit code that adds off to the native stack pointer; emits nothing
   when off is zero. */
static __inline__ void raw_inc_sp(int off)
{
    if (off == 0)
        return;
    raw_add_l_ri(ESP_INDEX, off);
}

/*************************************************************************
* Handling mistaken direct memory access                                *
*************************************************************************/

// gb-- I don't need that part for JIT Basilisk II
#if defined (NATMEM_OFFSET) && 0
    #include <asm/sigcontext.h>
    #include <signal.h>

    /* Direction of the faulting access as decoded by the handler below */
    #define SIG_READ 1
    #define SIG_WRITE 2

/* Set to 1 by the handler once it has patched in a jump to veccode */
static int in_handler = 0;
/* Buffer in which the handler generates its fix-up code */
static uae_u8 veccode[256];

static void vec(int x, struct sigcontext sc)
{
    uae_u8* i = (uae_u8*)sc.eip;
    uae_u32 addr = sc.cr2;
    int r = -1;
    int size = 4;
    int dir = -1;
    int len = 0;
    int j;

    write_log("fault address is %08x at %08x\n", sc.cr2, sc.eip);
    if (!canbang)
        write_log("Not happy! Canbang is 0 in SIGSEGV handler!\n");
    if (in_handler)
        write_log("Argh --- Am already in a handler. Shouldn't happen!\n");

    if (canbang && i >= compiled_code && i <= current_compile_p)
    {
        if (*i == 0x66)
        {
            i++;
            size = 2;
            len++;
        }

        switch (i[0])
        {
            case 0x8a:
                if ((i[1] & 0xc0) == 0x80)
                {
                    r = (i[1] >> 3) & 7;
                    dir = SIG_READ;
                    size = 1;
                    len += 6;
                    break;
                }
                break;
            case 0x88:
                if ((i[1] & 0xc0) == 0x80)
                {
                    r = (i[1] >> 3) & 7;
                    dir = SIG_WRITE;
                    size = 1;
                    len += 6;
                    break;
                }
                break;
            case 0x8b:
                if ((i[1] & 0xc0) == 0x80)
                {
                    r = (i[1] >> 3) & 7;
                    dir = SIG_READ;
                    len += 6;
                    break;
                }
                if ((i[1] & 0xc0) == 0x40)
                {
                    r = (i[1] >> 3) & 7;
                    dir = SIG_READ;
                    len += 3;
                    break;
                }
                break;
            case 0x89:
                if ((i[1] & 0xc0) == 0x80)
                {
                    r = (i[1] >> 3) & 7;
                    dir = SIG_WRITE;
                    len += 6;
                    break;
                }
                if ((i[1] & 0xc0) == 0x40)
                {
                    r = (i[1] >> 3) & 7;
                    dir = SIG_WRITE;
                    len += 3;
                    break;
                }
                break;
        }
    }

    if (r != -1)
    {
        void* pr = NULL;
        write_log("register was %d, direction was %d, size was %d\n", r, dir, size);

        switch (r)
        {
            case 0: pr = &(sc.eax); break;
            case 1: pr = &(sc.ecx); break;
            case 2: pr = &(sc.edx); break;
            case 3: pr = &(sc.ebx); break;
            case 4: pr = (size > 1) ? NULL : (((uae_u8*)&(sc.eax)) + 1); break;
            case 5: pr = (size > 1) ?
                         (void*)(&(sc.ebp)) :
                         (void*)(((uae_u8*)&(sc.ecx)) + 1); break;
            case 6: pr = (size > 1) ?
                         (void*)(&(sc.esi)) :
                         (void*)(((uae_u8*)&(sc.edx)) + 1); break;
            case 7: pr = (size > 1) ?
                         (void*)(&(sc.edi)) :
                         (void*)(((uae_u8*)&(sc.ebx)) + 1); break;
            default: abort();
        }
        if (pr)
        {
            blockinfo* bi;

            if (currprefs.comp_oldsegv)
            {
                addr -= NATMEM_OFFSET;

                if ((addr >= 0x10000000 && addr < 0x40000000) ||
                    (addr >= 0x50000000))
                {
                    write_log("Suspicious address in %x SEGV handler.\n", addr);
                }
                if (dir == SIG_READ)
                {
                    switch (size)
                    {
                        case 1: *((uae_u8*)pr) = get_byte(addr); break;
                        case 2: *((uae_u16*)pr) = get_word(addr); break;
                        case 4: *((uae_u32*)pr) = get_long(addr); break;
                        default: abort();
                    }
                }
                else /* write */
                {
                    switch (size)
                    {
                        case 1: put_byte(addr, *((uae_u8*)pr)); break;
                        case 2: put_word(addr, *((uae_u16*)pr)); break;
                        case 4: put_long(addr, *((uae_u32*)pr)); break;
                        default: abort();
                    }
                }
                write_log("Handled one access!\n");
                fflush(stdout);
                segvcount++;
                sc.eip += len;
            }
            else
            {
                void* tmp = target;
                int i;
                uae_u8 vecbuf[5];

                addr -= NATMEM_OFFSET;

                if ((addr >= 0x10000000 && addr < 0x40000000) ||
                    (addr >= 0x50000000))
                {
                    write_log("Suspicious address in %x SEGV handler.\n", addr);
                }

                target = (uae_u8*)sc.eip;
                for (i = 0; i < 5; i++)
                    vecbuf[i] = target[i];
                emit_byte(0xe9);
                emit_long((uintptr)veccode - (uintptr)target - 4);
                write_log("Create jump to %p\n", veccode);

                write_log("Handled one access!\n");
                fflush(stdout);
                segvcount++;

                target = veccode;

                if (dir == SIG_READ)
                {
                    switch (size)
                    {
                        case 1: raw_mov_b_ri(r, get_byte(addr)); break;
                        case 2: raw_mov_w_ri(r, get_byte(addr)); break;
                        case 4: raw_mov_l_ri(r, get_byte(addr)); break;
                        default: abort();
                    }
                }
                else /* write */
                {
                    switch (size)
                    {
                        case 1: put_byte(addr, *((uae_u8*)pr)); break;
                        case 2: put_word(addr, *((uae_u16*)pr)); break;
                        case 4: put_long(addr, *((uae_u32*)pr)); break;
                        default: abort();
                    }
                }
                for (i = 0; i < 5; i++)
                    raw_mov_b_mi(sc.eip + i, vecbuf[i]);
                raw_mov_l_mi((uintptr) & in_handler, 0);
                emit_byte(0xe9);
                emit_long(sc.eip + len - (uintptr)target - 4);
                in_handler = 1;
                target = tmp;
            }
            bi = active;
            while (bi)
            {
                if (bi->handler &&
                    (uae_u8*)bi->direct_handler <= i &&
                    (uae_u8*)bi->nexthandler > i)
                {
                    write_log("deleted trigger (%p<%p<%p) %p\n",
                              bi->handler,
                              i,
                              bi->nexthandler,
                              bi->pc_p);
                    invalidate_block(bi);
                    raise_in_cl_list(bi);
                    set_special(0);
                    return;
                }
                bi = bi->next;
            }
            /* Not found in the active list. Might be a rom routine that
               is in the dormant list */
            bi = dormant;
            while (bi)
            {
                if (bi->handler &&
                    (uae_u8*)bi->direct_handler <= i &&
                    (uae_u8*)bi->nexthandler > i)
                {
                    write_log("deleted trigger (%p<%p<%p) %p\n",
                              bi->handler,
                              i,
                              bi->nexthandler,
                              bi->pc_p);
                    invalidate_block(bi);
                    raise_in_cl_list(bi);
                    set_special(0);
                    return;
                }
                bi = bi->next;
            }
            write_log("Huh? Could not find trigger!\n");
            return;
        }
    }
    write_log("Can't handle access!\n");
    for (j = 0; j < 10; j++)
    {
        write_log("instruction byte %2d is %02x\n", j, i[j]);
    }
    write_log("Please send the above info (starting at \"fault address\") to\n"
              "bmeyer@csse.monash.edu.au\n"
              "This shouldn't happen ;-)\n");
    fflush(stdout);
    signal(SIGSEGV, SIG_DFL);  /* returning here will cause a "real" SEGV */
}
#endif


/*************************************************************************
* Checking for CPU features                                             *
*************************************************************************/

/* Identification record for the host CPU; filled in by raw_init_cpu(). */
struct cpuinfo_x86
{
    uae_u8 x86;             // CPU family
    uae_u8 x86_vendor;      // CPU vendor
    uae_u8 x86_processor;   // CPU canonical processor type
    uae_u8 x86_brand_id;    // CPU BrandID if supported, yield 0 otherwise
    uae_u32 x86_hwcap;      // EDX returned by CPUID level 1 (bit 15 is tested for CMOV)
    uae_u8 x86_model;       // CPU model, taken from the CPUID level 1 signature
    uae_u8 x86_mask;        // Low 4 bits of the CPUID level 1 signature
    int cpuid_level;        // Maximum supported CPUID level, -1=no CPUID
    char x86_vendor_id[16]; // 12-character CPUID vendor string, NUL-terminated
};
/* The single global instance describing the host CPU. */
struct cpuinfo_x86 cpuinfo;

/* Vendor codes stored in cpuinfo_x86::x86_vendor by x86_get_cpu_vendor(). */
enum
{
    X86_VENDOR_INTEL = 0,
    X86_VENDOR_CYRIX = 1,
    X86_VENDOR_AMD = 2,
    X86_VENDOR_UMC = 3,
    X86_VENDOR_NEXGEN = 4,
    X86_VENDOR_CENTAUR = 5,
    X86_VENDOR_RISE = 6,
    X86_VENDOR_TRANSMETA = 7,
    X86_VENDOR_NSC = 8,
    X86_VENDOR_UNKNOWN = 0xff   /* vendor string not recognised */
};

enum
{
    X86_PROCESSOR_I386,                     /* 80386 */
    X86_PROCESSOR_I486,                     /* 80486DX, 80486SX, 80486DX[24] */
    X86_PROCESSOR_PENTIUM,
    X86_PROCESSOR_PENTIUMPRO,
    X86_PROCESSOR_K6,
    X86_PROCESSOR_ATHLON,
    X86_PROCESSOR_PENTIUM4,
    X86_PROCESSOR_X86_64,
    X86_PROCESSOR_max
};

/* Printable name for each X86_PROCESSOR_* value, indexed by that value. */
static const char* x86_processor_string_table[X86_PROCESSOR_max] = {
    "80386",
    "80486",
    "Pentium",
    "PentiumPro",
    "K6",
    "Athlon",
    "Pentium4",
    "x86-64"
};

/* Per-processor code alignment settings, indexed by X86_PROCESSOR_*.
   raw_init_cpu() copies align_loop and align_jump into align_loops and
   align_jumps when tune_alignment is set; the remaining fields are not
   read anywhere in this part of the file. */
static struct ptt
{
    const int align_loop;
    const int align_loop_max_skip;
    const int align_jump;
    const int align_jump_max_skip;
    const int align_func;
}
x86_alignments[X86_PROCESSOR_max] = {
    { 4, 3, 4, 3, 4 },          /* X86_PROCESSOR_I386 */
    { 16, 15, 16, 15, 16 },     /* X86_PROCESSOR_I486 */
    { 16, 7, 16, 7, 16 },       /* X86_PROCESSOR_PENTIUM */
    { 16, 15, 16, 7, 16 },      /* X86_PROCESSOR_PENTIUMPRO */
    { 32, 7, 32, 7, 32 },       /* X86_PROCESSOR_K6 */
    { 16, 7, 16, 7, 16 },       /* X86_PROCESSOR_ATHLON */
    { 0, 0, 0, 0, 0 },          /* X86_PROCESSOR_PENTIUM4 */
    { 16, 7, 16, 7, 16 }        /* X86_PROCESSOR_X86_64 */
};

/*
 * Translate the CPUID vendor string held in c->x86_vendor_id into one of
 * the X86_VENDOR_* codes and store it in c->x86_vendor.  Strings that are
 * not in the table yield X86_VENDOR_UNKNOWN.
 */
static void
x86_get_cpu_vendor(struct cpuinfo_x86* c)
{
    static const struct
    {
        const char* id;
        int vendor;
    }
    known_vendors[] = {
        { "GenuineIntel", X86_VENDOR_INTEL },
        { "AuthenticAMD", X86_VENDOR_AMD },
        { "CyrixInstead", X86_VENDOR_CYRIX },
        { "Geode by NSC", X86_VENDOR_NSC },
        { "UMC UMC UMC ", X86_VENDOR_UMC },
        { "CentaurHauls", X86_VENDOR_CENTAUR },
        { "NexGenDriven", X86_VENDOR_NEXGEN },
        { "RiseRiseRise", X86_VENDOR_RISE },
        { "GenuineTMx86", X86_VENDOR_TRANSMETA },
        { "TransmetaCPU", X86_VENDOR_TRANSMETA }
    };
    const unsigned int n_known = sizeof(known_vendors) / sizeof(known_vendors[0]);
    const char* name = c->x86_vendor_id;
    int found = X86_VENDOR_UNKNOWN;

    for (unsigned int k = 0; k < n_known; k++)
    {
        if (strcmp(name, known_vendors[k].id) == 0)
        {
            found = known_vendors[k].vendor;
            break;
        }
    }
    c->x86_vendor = found;
}

/*
 * Execute the CPUID instruction for leaf `op` and return the resulting
 * register values.
 *
 * A small stub is assembled into a freshly acquired executable page using
 * the JIT's own emitters, called once, and the page is released again.
 * The stub reads its input from and writes its results to function-static
 * variables, whose addresses are baked into the generated code.
 *
 * eax/ebx/ecx/edx: where to store the respective result; any of them may
 *                  be NULL if that register is not wanted.
 * Aborts if the executable page cannot be acquired.
 */
static void
cpuid(uae_u32 op, uae_u32* eax, uae_u32* ebx, uae_u32* ecx, uae_u32* edx)
{
    static uae_u32 s_op, s_eax, s_ebx, s_ecx, s_edx;
    const int CPUID_SPACE = 4096;

    uae_u8* stub = (uae_u8*)vm_acquire(CPUID_SPACE);
    if (stub == VM_MAP_FAILED)
        abort();
    vm_protect(stub, CPUID_SPACE, VM_PAGE_READ | VM_PAGE_WRITE | VM_PAGE_EXECUTE);

    /* Redirect code emission into the stub page for the time being. */
    uae_u8* saved_target = get_target();
    s_op = op;
    set_target(stub);

    /* Save registers 0..3 (eax, ecx, edx, ebx) around the CPUID. */
    for (int reg = 0; reg <= 3; reg++)
        raw_push_l_r(reg);

    raw_mov_l_rm(0, (uintptr) & s_op);      /* eax <- requested leaf */
    raw_cpuid(0);
    raw_mov_l_mr((uintptr) & s_eax, 0);     /* eax */
    raw_mov_l_mr((uintptr) & s_ebx, 3);     /* ebx */
    raw_mov_l_mr((uintptr) & s_ecx, 1);     /* ecx */
    raw_mov_l_mr((uintptr) & s_edx, 2);     /* edx */

    /* Restore in the reverse order of the pushes. */
    for (int reg = 3; reg >= 0; reg--)
        raw_pop_l_r(reg);
    raw_ret();
    set_target(saved_target);

    /* Run the stub, then hand back whichever results were asked for. */
    ((cpuop_func*)stub)(0);
    if (eax != NULL)
        *eax = s_eax;
    if (ebx != NULL)
        *ebx = s_ebx;
    if (ecx != NULL)
        *ecx = s_ecx;
    if (edx != NULL)
        *edx = s_edx;

    vm_release(stub, CPUID_SPACE);
}

static void
raw_init_cpu(void)
{
    struct cpuinfo_x86* c = &cpuinfo;

    /* Defaults */
    c->x86_processor = X86_PROCESSOR_max;
    c->x86_vendor = X86_VENDOR_UNKNOWN;
    c->cpuid_level = -1;            /* CPUID not detected */
    c->x86_model = c->x86_mask = 0; /* So far unknown... */
    c->x86_vendor_id[0] = '\0';     /* Unset */
    c->x86_hwcap = 0;

    /* Get vendor name */
    c->x86_vendor_id[12] = '\0';
    cpuid(0x00000000,
          (uae_u32*)&c->cpuid_level,
          (uae_u32*)&c->x86_vendor_id[0],
          (uae_u32*)&c->x86_vendor_id[8],
          (uae_u32*)&c->x86_vendor_id[4]);
    x86_get_cpu_vendor(c);

    /* Intel-defined flags: level 0x00000001 */
    c->x86_brand_id = 0;
    if (c->cpuid_level >= 0x00000001)
    {
        uae_u32 tfms, brand_id;
        cpuid(0x00000001, &tfms, &brand_id, NULL, &c->x86_hwcap);
        c->x86 = (tfms >> 8) & 15;
        if (c->x86 == 0xf)
            c->x86 += (tfms >> 20) & 0xff; /* extended family */
        c->x86_model = (tfms >> 4) & 15;
        if (c->x86_model == 0xf)
            c->x86_model |= (tfms >> 12) & 0xf0; /* extended model */
        c->x86_brand_id = brand_id & 0xff;
        c->x86_mask = tfms & 15;
    }
    else
    {
        /* Have CPUID level 0 only - unheard of */
        c->x86 = 4;
    }

    /* AMD-defined flags: level 0x80000001 */
    uae_u32 xlvl;
    cpuid(0x80000000, &xlvl, NULL, NULL, NULL);
    if ((xlvl & 0xffff0000) == 0x80000000)
    {
        if (xlvl >= 0x80000001)
        {
            uae_u32 features, extra_features;
            cpuid(0x80000001, NULL, NULL, &extra_features, &features);
            if (features & (1 << 29))
            {
                /* Assume x86-64 if long mode is supported */
                c->x86_processor = X86_PROCESSOR_X86_64;
            }
            if (extra_features & (1 << 0))
                have_lahf_lm = true;
        }
    }

    /* Canonicalize processor ID */
    switch (c->x86)
    {
        case 3:
            c->x86_processor = X86_PROCESSOR_I386;
            break;
        case 4:
            c->x86_processor = X86_PROCESSOR_I486;
            break;
        case 5:
            if (c->x86_vendor == X86_VENDOR_AMD)
                c->x86_processor = X86_PROCESSOR_K6;
            else
                c->x86_processor = X86_PROCESSOR_PENTIUM;
            break;
        case 6:
            if (c->x86_vendor == X86_VENDOR_AMD)
                c->x86_processor = X86_PROCESSOR_ATHLON;
            else
                c->x86_processor = X86_PROCESSOR_PENTIUMPRO;
            break;
        case 15:
            if (c->x86_processor == X86_PROCESSOR_max)
            {
                switch (c->x86_vendor)
                {
                    case X86_VENDOR_INTEL:
                        c->x86_processor = X86_PROCESSOR_PENTIUM4;
                        break;
                    case X86_VENDOR_AMD:
                        /* Assume a 32-bit Athlon processor if not in long mode */
                        c->x86_processor = X86_PROCESSOR_ATHLON;
                        break;
                }
            }
            break;
    }
    if (c->x86_processor == X86_PROCESSOR_max)
    {
        c->x86_processor = X86_PROCESSOR_I386;
        fprintf(stderr, "Error: unknown processor type, assuming i386\n");
        fprintf(stderr, "  Family  : %d\n", c->x86);
        fprintf(stderr, "  Model   : %d\n", c->x86_model);
        fprintf(stderr, "  Mask    : %d\n", c->x86_mask);
        fprintf(stderr, "  Vendor  : %s [%d]\n", c->x86_vendor_id, c->x86_vendor);
        if (c->x86_brand_id)
            fprintf(stderr, "  BrandID : %02x\n", c->x86_brand_id);
    }

    /* Have CMOV support? */
    have_cmov = c->x86_hwcap & (1 << 15);
    #if defined (__x86_64__)
    if (!have_cmov)
    {
        write_log("x86-64 implementations are bound to have CMOV!\n");
        abort();
    }
    #endif

    /* Can the host CPU suffer from partial register stalls? */
    have_rat_stall = (c->x86_vendor == X86_VENDOR_INTEL);
    #if 1
    /* It appears that partial register writes are a bad idea even on
       AMD K7 cores, even though they are not supposed to have the
       dreaded rat stall. Why? Anyway, that's why we lie about it ;-) */
    if (c->x86_processor == X86_PROCESSOR_ATHLON)
        have_rat_stall = true;
    #endif

    /* Alignments */
    if (tune_alignment)
    {
        align_loops = x86_alignments[c->x86_processor].align_loop;
        align_jumps = x86_alignments[c->x86_processor].align_jump;
    }

    write_log("Max CPUID level=%d Processor is %s [%s]\n",
              c->cpuid_level, c->x86_vendor_id,
              x86_processor_string_table[c->x86_processor]);

    raw_flags_init();
}

/*
 * Probe how the host CPU's BSF instruction treats the flags.
 *
 * For every combination of initial ZF/CF/OF/SF and for the operand values
 * -1, 0 and 1, the flags are loaded, BSF is executed and the flags are
 * read back.  The expected result is ZF set exactly when the operand was
 * zero, with SF, OF and CF left as they were.
 *
 * Returns true if every probe matched that expectation, false (after
 * logging a message) if any probe differed.
 */
static bool target_check_bsf(void)
{
    bool mismatch = false;

    /* combo bits, most significant first: ZF, CF, OF, SF. */
    for (int combo = 0; combo < 16; combo++)
    {
        const int g_ZF = (combo >> 3) & 1;
        const int g_CF = (combo >> 2) & 1;
        const int g_OF = (combo >> 1) & 1;
        const int g_SF = combo & 1;

        for (int value = -1; value <= 1; value++)
        {
            unsigned long flags = (g_SF << 7) | (g_OF << 11) | (g_ZF << 6) | g_CF;
            unsigned long tmp = value;
            __asm__ __volatile__ ("push %0; popf; bsf %1,%1; pushf; pop %0"
                                  : "+r" (flags), "+r" (tmp) : : "cc");

            const int got_OF = (flags >> 11) & 1;
            const int got_SF = (flags >> 7) & 1;
            const int got_ZF = (flags >> 6) & 1;
            const int got_CF = flags & 1;
            const int want_ZF = (value == 0);

            const bool as_expected = got_ZF == want_ZF &&
                                     got_SF == g_SF &&
                                     got_OF == g_OF &&
                                     got_CF == g_CF;
            if (!as_expected)
                mismatch = true;
        }
    }

    if (mismatch)
        write_log("Target CPU defines all flags on BSF instruction\n");
    return !mismatch;
}


/*************************************************************************
* FPU stuff                                                             *
*************************************************************************/


/* Reset the x87 stack bookkeeping: no emulated FP register is on the
   stack (spos == -2) and the stack itself is empty (tos == -1). */
static __inline__ void raw_fp_init(void)
{
    int reg = 0;

    while (reg < N_FREGS)
    {
        live.spos[reg] = -2;
        reg++;
    }
    live.tos = -1;  /* Stack is empty */
}

/* Emit code that discards every entry currently tracked on the x87 stack,
   then reset the bookkeeping via raw_fp_init(). */
static __inline__ void raw_fp_cleanup_drop(void)
{
    #if 0
    /* using FINIT instead of popping all the entries.
       Seems to have side effects --- there is display corruption in
       Quake when this is used */
    if (live.tos > 1)
    {
        emit_byte(0x9b);
        emit_byte(0xdb);
        emit_byte(0xe3);
        live.tos = -1;
    }
    #endif
    /* Drop entries two at a time while at least two remain (de d9). */
    for (; live.tos >= 1; live.tos -= 2)
    {
        emit_byte(0xde);
        emit_byte(0xd9);
    }
    /* Drop a remaining single entry (dd d8). */
    for (; live.tos >= 0; live.tos--)
    {
        emit_byte(0xdd);
        emit_byte(0xd8);
    }
    raw_fp_init();
}

/* Make emulated FP register r the top of the x87 stack.
   If r is not on the stack yet, a placeholder value is pushed for it;
   otherwise it is exchanged with the current top entry (unless it already
   is the top).  The live.spos / live.onstack maps are kept in step. */
static __inline__ void make_tos(int r)
{
    const int old_pos = live.spos[r];

    if (old_pos < 0)
    {
        /* Register not yet on stack */
        emit_byte(0xd9);
        emit_byte(0xe8); /* Push '1' on the stack, just to grow it */
        live.tos++;
        live.spos[r] = live.tos;
        live.onstack[live.tos] = r;
    }
    else if (old_pos != live.tos)
    {
        /* Register is on stack, but not on top: swap it with the top */
        const int top_reg = live.onstack[live.tos];

        emit_byte(0xd9);
        emit_byte(0xc8 + live.tos - old_pos);  /* exchange it with top of stack */
        live.onstack[live.tos] = r;
        live.spos[r] = live.tos;
        live.onstack[old_pos] = top_reg;
        live.spos[top_reg] = old_pos;
    }
}

/* Arrange the x87 stack so that r is on top and r2 directly below it. */
static __inline__ void make_tos2(int r, int r2)
{
    /* Put the reg that's supposed to end up in position2 on top */
    make_tos(r2);

    if (live.spos[r] >= 0)
    {
        /* r is already on the stack: first sink r2 one slot down by
           swapping the top two entries... */
        emit_byte(0xd9);
        emit_byte(0xc9); /* Move r2 into position 2 */

        const int below = live.onstack[live.tos - 1];
        live.onstack[live.tos] = below;
        live.spos[below] = live.tos;
        live.onstack[live.tos - 1] = r2;
        live.spos[r2] = live.tos - 1;
    }

    /* ...then bring r to the top.  If r was not on the stack, this pushes
       it, which leaves r2 in the second position as well. */
    make_tos(r);
}

/* Return the distance of emulated FP register r from the top of the x87
   stack (0 == top).  Aborts if r is not on the stack or if the recorded
   position lies above the current top. */
static __inline__ int stackpos(int r)
{
    const int pos = live.spos[r];

    if (pos < 0)
        abort();
    if (pos > live.tos)
    {
        printf("Looking for spos for fnreg %d\n", r);
        abort();
    }
    return live.tos - pos;
}

/* Ensure emulated FP register r has a slot on the x87 stack; registers
   that are already there are left where they are. */
static __inline__ void usereg(int r)
{
    const bool on_stack = (live.spos[r] >= 0);

    if (!on_stack)
        make_tos(r);
}

/* Called when the generated code has just pushed one extra FP value on
   top of the tracked stack.  That value becomes register r: if r already
   has a slot, the value is stored into it and popped; otherwise the new
   top entry is simply adopted as r's slot. */
static __inline__ void tos_make(int r)
{
    if (live.spos[r] >= 0)
    {
        emit_byte(0xdd);
        emit_byte(0xd8 + (live.tos + 1) - live.spos[r]);  /* store top of stack in reg,
                                                             and pop it */
    }
    else
    {
        live.tos++;
        live.spos[r] = live.tos;
        live.onstack[live.tos] = r;
    }
}

/* FP helper functions */
/* Each DEFINE_OP line below generates raw_<NAME>(uint32 m), a helper that
   emits a single x87 load/store instruction taking m as its memory
   operand.  Two variants exist, selected by USE_NEW_RTASM. */
#if USE_NEW_RTASM
    /* Encoding is delegated to the given GEN macro; no base or index
       register is passed (X86_NOREG). */
    #define DEFINE_OP(NAME, GEN) \
    static inline void raw_##NAME(uint32 m) \
    { \
        GEN(m, X86_NOREG, X86_NOREG, 1); \
    }
DEFINE_OP(fstl, FSTDm);
DEFINE_OP(fstpl, FSTPDm);
DEFINE_OP(fldl, FLDDm);
DEFINE_OP(fildl, FILDLm);
DEFINE_OP(fistl, FISTLm);
DEFINE_OP(flds, FLDSm);
DEFINE_OP(fsts, FSTSm);
DEFINE_OP(fstpt, FSTPTm);
DEFINE_OP(fldt, FLDTm);
#else
    /* Hand encoding: opcode byte, ModRM byte selecting a 32-bit absolute
       displacement, then the displacement m itself. */
    #define DEFINE_OP(NAME, OP1, OP2) \
    static inline void raw_##NAME(uint32 m) \
    { \
        emit_byte(OP1); \
        emit_byte(OP2); \
        emit_long(m); \
    }
DEFINE_OP(fstl, 0xdd, 0x15);    /* fst   m64 (double) */
DEFINE_OP(fstpl, 0xdd, 0x1d);   /* fstp  m64 (double) */
DEFINE_OP(fldl, 0xdd, 0x05);    /* fld   m64 (double) */
DEFINE_OP(fildl, 0xdb, 0x05);   /* fild  m32 (integer) */
DEFINE_OP(fistl, 0xdb, 0x15);   /* fist  m32 (integer) */
DEFINE_OP(flds, 0xd9, 0x05);    /* fld   m32 (single) */
DEFINE_OP(fsts, 0xd9, 0x15);    /* fst   m32 (single) */
DEFINE_OP(fstpt, 0xdb, 0x3d);   /* fstp  m80 (extended) */
DEFINE_OP(fldt, 0xdb, 0x2d);    /* fld   m80 (extended) */
#endif
#undef DEFINE_OP

/* Store FP register r as a double at address m.  r is brought to the top
   of the x87 stack first and stays on the stack afterwards. */
LOWFUNC(NONE, WRITE, 2, raw_fmov_mr, (MEMW m, FR r))
{
    make_tos(r);
    raw_fstl(m);
}
LENDFUNC(NONE, WRITE, 2, raw_fmov_mr, (MEMW m, FR r))

/* Store FP register r as a double at address m and drop r from the x87
   stack: the store pops the value, and the bookkeeping is updated so that
   r no longer has a stack slot. */
LOWFUNC(NONE, WRITE, 2, raw_fmov_mr_drop, (MEMW m, FR r))
{
    make_tos(r);
    raw_fstpl(m);                   /* store and pop it */
    live.onstack[live.tos] = -1;    /* the old top slot holds nothing now */
    live.tos--;
    live.spos[r] = -2;              /* r is no longer on the stack */
}
LENDFUNC(NONE, WRITE, 2, raw_fmov_mr_drop, (MEMW m, FR r))

/* Load the double at address m into FP register r: the value is pushed
   on the x87 stack and then assigned to r by tos_make(). */
LOWFUNC(NONE, READ, 2, raw_fmov_rm, (FW r, MEMR m))
{
    raw_fldl(m);
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmov_rm, (FW r, MEMR m))

/* Load the 32-bit integer at address m into FP register r (fild), then
   assign the pushed value to r via tos_make(). */
LOWFUNC(NONE, READ, 2, raw_fmovi_rm, (FW r, MEMR m))
{
    raw_fildl(m);
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmovi_rm, (FW r, MEMR m))

/* Store FP register r as a 32-bit integer at address m (fist).  r is
   brought to the top of the x87 stack first and stays on the stack. */
LOWFUNC(NONE, WRITE, 2, raw_fmovi_mr, (MEMW m, FR r))
{
    make_tos(r);
    raw_fistl(m);
}
LENDFUNC(NONE, WRITE, 2, raw_fmovi_mr, (MEMW m, FR r))

/* Load the single-precision value at address m into FP register r. */
LOWFUNC(NONE, READ, 2, raw_fmovs_rm, (FW r, MEMR m))
{
    raw_flds(m);
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmovs_rm, (FW r, MEMR m))

/* Store FP register r as a single-precision value at address m.  r is
   brought to the top of the x87 stack first and stays on the stack. */
LOWFUNC(NONE, WRITE, 2, raw_fmovs_mr, (MEMW m, FR r))
{
    make_tos(r);
    raw_fsts(m);
}
LENDFUNC(NONE, WRITE, 2, raw_fmovs_mr, (MEMW m, FR r))

/* Store FP register r as an extended-precision value at address m while
   keeping r on the x87 stack.  The only extended store available pops
   the stack, so a temporary copy of r is pushed and that copy is stored
   and popped. */
LOWFUNC(NONE, WRITE, 2, raw_fmov_ext_mr, (MEMW m, FR r))
{
    usereg(r);
    const int depth = stackpos(r);

    emit_byte(0xd9);
    emit_byte(0xc0 + depth);    /* push a copy of r on top of the stack */

    raw_fstpt(m);               /* store the copy and pop it again */
}
LENDFUNC(NONE, WRITE, 2, raw_fmov_ext_mr, (MEMW m, FR r))

/* Store FP register r as an extended-precision value at address m and
   drop r from the x87 stack: the store pops the value, and the
   bookkeeping is updated so that r no longer has a stack slot. */
LOWFUNC(NONE, WRITE, 2, raw_fmov_ext_mr_drop, (MEMW m, FR r))
{
    make_tos(r);
    raw_fstpt(m);                   /* store and pop it */
    live.onstack[live.tos] = -1;    /* the old top slot holds nothing now */
    live.tos--;
    live.spos[r] = -2;              /* r is no longer on the stack */
}
LENDFUNC(NONE, WRITE, 2, raw_fmov_ext_mr_drop, (MEMW m, FR r))

/* Load the extended-precision value at address m into FP register r. */
LOWFUNC(NONE, READ, 2, raw_fmov_ext_rm, (FW r, MEMR m))
{
    raw_fldt(m);
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmov_ext_rm, (FW r, MEMR m))

/* Set FP register r to pi. */
LOWFUNC(NONE, NONE, 1, raw_fmov_pi, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xeb);    /* fldpi */
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_pi, (FW r))

/* Set FP register r to log10(2). */
LOWFUNC(NONE, NONE, 1, raw_fmov_log10_2, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xec);    /* fldlg2 */
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_log10_2, (FW r))

/* Set FP register r to log2(e). */
LOWFUNC(NONE, NONE, 1, raw_fmov_log2_e, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xea);    /* fldl2e */
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_log2_e, (FW r))

/* Set FP register r to ln(2). */
LOWFUNC(NONE, NONE, 1, raw_fmov_loge_2, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xed);    /* fldln2 */
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_loge_2, (FW r))

/* Set FP register r to 1.0. */
LOWFUNC(NONE, NONE, 1, raw_fmov_1, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xe8);    /* fld1 */
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_1, (FW r))

/* Set FP register r to 0.0. */
LOWFUNC(NONE, NONE, 1, raw_fmov_0, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xee);    /* fldz */
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_0, (FW r))

/* Copy FP register s into FP register d.
   When s is already on top of the x87 stack and d has a slot, a plain
   register-to-register store is enough; otherwise a copy of s is pushed
   and handed to d via tos_make(). */
LOWFUNC(NONE, NONE, 2, raw_fmov_rr, (FW d, FR s))
{
    usereg(s);
    const int src_depth = stackpos(s);
    const bool store_in_place = (src_depth == 0 && live.spos[d] >= 0);

    if (!store_in_place)
    {
        emit_byte(0xd9);
        emit_byte(0xc0 + src_depth); /* duplicate source on tos */
        tos_make(d); /* store to destination, pop if necessary */
    }
    else
    {
        /* source is on top of stack, and we already have the dest */
        const int dst_depth = stackpos(d);
        emit_byte(0xdd);
        emit_byte(0xd0 + dst_depth);
    }
}
LENDFUNC(NONE, NONE, 2, raw_fmov_rr, (FW d, FR s))

/* Emit fldcw with a memory operand of the form [index + base]: load the
   x87 control word from the address formed by register `index` plus the
   32-bit displacement `base`. */
LOWFUNC(NONE, READ, 4, raw_fldcw_m_indexed, (R4 index, IMM base))
{
    emit_byte(0xd9);
    emit_byte(0xa8 + index);    /* ModRM: opcode extension /5, [index + disp32] */
    emit_long(base);
}
LENDFUNC(NONE, READ, 4, raw_fldcw_m_indexed, (R4 index, IMM base))


/* d = sqrt(s).  When d and s are the same register the operation is done
   in place on the top of the stack; otherwise it is applied to a pushed
   copy of s, which then becomes d. */
LOWFUNC(NONE, NONE, 2, raw_fsqrt_rr, (FW d, FR s))
{
    if (d == s)
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xfa); /* take square root */
    }
    else
    {
        usereg(s);
        const int src_depth = stackpos(s);

        emit_byte(0xd9);
        emit_byte(0xc0 + src_depth); /* duplicate source */
        emit_byte(0xd9);
        emit_byte(0xfa); /* take square root */
        tos_make(d); /* store to destination */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fsqrt_rr, (FW d, FR s))

/* d = |s|.  When d and s are the same register the operation is done in
   place on the top of the stack; otherwise it is applied to a pushed copy
   of s, which then becomes d. */
LOWFUNC(NONE, NONE, 2, raw_fabs_rr, (FW d, FR s))
{
    if (d == s)
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xe1); /* take fabs */
    }
    else
    {
        usereg(s);
        const int src_depth = stackpos(s);

        emit_byte(0xd9);
        emit_byte(0xc0 + src_depth); /* duplicate source */
        emit_byte(0xd9);
        emit_byte(0xe1); /* take fabs */
        tos_make(d); /* store to destination */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fabs_rr, (FW d, FR s))

/* d = s rounded to an integer (frndint).  When d and s are the same
   register the operation is done in place on the top of the stack;
   otherwise it is applied to a pushed copy of s, which then becomes d. */
LOWFUNC(NONE, NONE, 2, raw_frndint_rr, (FW d, FR s))
{
    if (d == s)
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xfc); /* take frndint */
    }
    else
    {
        usereg(s);
        const int src_depth = stackpos(s);

        emit_byte(0xd9);
        emit_byte(0xc0 + src_depth); /* duplicate source */
        emit_byte(0xd9);
        emit_byte(0xfc); /* take frndint */
        tos_make(d); /* store to destination */
    }
}
LENDFUNC(NONE, NONE, 2, raw_frndint_rr, (FW d, FR s))

/* d = cos(s).  When d and s are the same register the operation is done
   in place on the top of the stack; otherwise it is applied to a pushed
   copy of s, which then becomes d. */
LOWFUNC(NONE, NONE, 2, raw_fcos_rr, (FW d, FR s))
{
    if (d == s)
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xff); /* take cos */
    }
    else
    {
        usereg(s);
        const int src_depth = stackpos(s);

        emit_byte(0xd9);
        emit_byte(0xc0 + src_depth); /* duplicate source */
        emit_byte(0xd9);
        emit_byte(0xff); /* take cos */
        tos_make(d); /* store to destination */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fcos_rr, (FW d, FR s))

/* d = sin(s).  When d and s are the same register the operation is done
   in place on the top of the stack; otherwise it is applied to a pushed
   copy of s, which then becomes d. */
LOWFUNC(NONE, NONE, 2, raw_fsin_rr, (FW d, FR s))
{
    if (d == s)
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xfe); /* take sin */
    }
    else
    {
        usereg(s);
        const int src_depth = stackpos(s);

        emit_byte(0xd9);
        emit_byte(0xc0 + src_depth); /* duplicate source */
        emit_byte(0xd9);
        emit_byte(0xfe); /* take sin */
        tos_make(d); /* store to destination */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fsin_rr, (FW d, FR s))

/* Constant 1.0 kept in memory so the generated code can add it with a
   memory-operand fadd instead of pushing another x87 stack entry. */
static const double one = 1;

/* d = 2^s.
   f2xm1 is only applied to the fractional part here, so the argument is
   split into a rounded part and the remainder: 2^remainder is computed as
   f2xm1 + 1 and then scaled by the rounded part with fscale. */
LOWFUNC(NONE, NONE, 2, raw_ftwotox_rr, (FW d, FR s))
{
    int ds;

    usereg(s);
    ds = stackpos(s);
    emit_byte(0xd9);
    emit_byte(0xc0 + ds); /* duplicate source */

    emit_byte(0xd9);
    emit_byte(0xc0);  /* duplicate top of stack. Now up to 8 high */
    emit_byte(0xd9);
    emit_byte(0xfc);  /* rndint */
    emit_byte(0xd9);
    emit_byte(0xc9);  /* swap top two elements */
    emit_byte(0xd8);
    emit_byte(0xe1);  /* subtract rounded from original */
    emit_byte(0xd9);
    emit_byte(0xf0);  /* f2xm1 */
    x86_fadd_m((uintptr) & one);  /* Add '1' without using extra stack space */
    emit_byte(0xd9);
    emit_byte(0xfd);  /* and scale it */
    emit_byte(0xdd);
    emit_byte(0xd9);  /* take the rounded value off */
    tos_make(d); /* store to destination */
}
LENDFUNC(NONE, NONE, 2, raw_ftwotox_rr, (FW d, FR s))

/* d = e^s, computed as 2^(s * log2(e)).
   After the multiplication the sequence is the same as for 2^x: split
   into rounded part and remainder, f2xm1 + 1 on the remainder, then
   fscale by the rounded part. */
LOWFUNC(NONE, NONE, 2, raw_fetox_rr, (FW d, FR s))
{
    int ds;

    usereg(s);
    ds = stackpos(s);
    emit_byte(0xd9);
    emit_byte(0xc0 + ds); /* duplicate source */
    emit_byte(0xd9);
    emit_byte(0xea);   /* fldl2e */
    emit_byte(0xde);
    emit_byte(0xc9);  /* fmulp --- multiply source by log2(e) */

    emit_byte(0xd9);
    emit_byte(0xc0);  /* duplicate top of stack. Now up to 8 high */
    emit_byte(0xd9);
    emit_byte(0xfc);  /* rndint */
    emit_byte(0xd9);
    emit_byte(0xc9);  /* swap top two elements */
    emit_byte(0xd8);
    emit_byte(0xe1);  /* subtract rounded from original */
    emit_byte(0xd9);
    emit_byte(0xf0);  /* f2xm1 */
    x86_fadd_m((uintptr) & one);  /* Add '1' without using extra stack space */
    emit_byte(0xd9);
    emit_byte(0xfd);  /* and scale it */
    emit_byte(0xdd);
    emit_byte(0xd9);  /* take the rounded value off */
    tos_make(d); /* store to destination */
}
LENDFUNC(NONE, NONE, 2, raw_fetox_rr, (FW d, FR s))

/* d = log2(s), computed with fyl2x using 1.0 as the multiplier. */
LOWFUNC(NONE, NONE, 2, raw_flog2_rr, (FW d, FR s))
{
    int ds;

    usereg(s);
    ds = stackpos(s);
    emit_byte(0xd9);
    emit_byte(0xc0 + ds); /* duplicate source */
    emit_byte(0xd9);
    emit_byte(0xe8); /* push '1' */
    emit_byte(0xd9);
    emit_byte(0xc9); /* swap top two */
    emit_byte(0xd9);
    emit_byte(0xf1); /* take 1*log2(x) (fyl2x; pops one entry) */
    tos_make(d); /* store to destination */
}
LENDFUNC(NONE, NONE, 2, raw_flog2_rr, (FW d, FR s))


/* d = -s.  When d and s are the same register the operation is done in
   place on the top of the stack; otherwise it is applied to a pushed copy
   of s, which then becomes d. */
LOWFUNC(NONE, NONE, 2, raw_fneg_rr, (FW d, FR s))
{
    if (d == s)
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xe0); /* take fchs */
    }
    else
    {
        usereg(s);
        const int src_depth = stackpos(s);

        emit_byte(0xd9);
        emit_byte(0xc0 + src_depth); /* duplicate source */
        emit_byte(0xd9);
        emit_byte(0xe0); /* take fchs */
        tos_make(d); /* store to destination */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fneg_rr, (FW d, FR s))

/* d = d + s.
   If s happens to be on top of the x87 stack, the form that writes to a
   lower stack entry is used; otherwise d is brought to the top and the
   form that writes to the top entry is used. */
LOWFUNC(NONE, NONE, 2, raw_fadd_rr, (FRW d, FR s))
{
    usereg(s);
    usereg(d);

    if (live.spos[s] != live.tos)
    {
        make_tos(d);
        const int src_depth = stackpos(s);

        emit_byte(0xd8);
        emit_byte(0xc0 + src_depth); /* add source to dest */
    }
    else
    {
        /* Source is on top of stack */
        const int dst_depth = stackpos(d);

        emit_byte(0xdc);
        emit_byte(0xc0 + dst_depth); /* add source to dest */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fadd_rr, (FRW d, FR s))

/* d = d - s.
   If s happens to be on top of the x87 stack, the form that writes to a
   lower stack entry is used; otherwise d is brought to the top and the
   form that writes to the top entry is used. */
LOWFUNC(NONE, NONE, 2, raw_fsub_rr, (FRW d, FR s))
{
    usereg(s);
    usereg(d);

    if (live.spos[s] != live.tos)
    {
        make_tos(d);
        const int src_depth = stackpos(s);

        emit_byte(0xd8);
        emit_byte(0xe0 + src_depth); /* sub src from dest */
    }
    else
    {
        /* Source is on top of stack */
        const int dst_depth = stackpos(d);

        emit_byte(0xdc);
        emit_byte(0xe8 + dst_depth); /* sub source from dest */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fsub_rr, (FRW d, FR s))

/* Compare d with s: d is brought to the top of the x87 stack and an
   unordered compare against s's stack slot is emitted.  Neither register
   is modified. */
LOWFUNC(NONE, NONE, 2, raw_fcmp_rr, (FR d, FR s))
{
    usereg(s);
    usereg(d);

    make_tos(d);
    const int src_depth = stackpos(s);

    emit_byte(0xdd);
    emit_byte(0xe0 + src_depth); /* cmp dest with source */
}
LENDFUNC(NONE, NONE, 2, raw_fcmp_rr, (FR d, FR s))

/* d = d * s.
   If s happens to be on top of the x87 stack, the form that writes to a
   lower stack entry is used; otherwise d is brought to the top and the
   form that writes to the top entry is used. */
LOWFUNC(NONE, NONE, 2, raw_fmul_rr, (FRW d, FR s))
{
    usereg(s);
    usereg(d);

    if (live.spos[s] != live.tos)
    {
        make_tos(d);
        const int src_depth = stackpos(s);

        emit_byte(0xd8);
        emit_byte(0xc8 + src_depth); /* mul dest by source */
    }
    else
    {
        /* Source is on top of stack */
        const int dst_depth = stackpos(d);

        emit_byte(0xdc);
        emit_byte(0xc8 + dst_depth); /* mul dest by source */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fmul_rr, (FRW d, FR s))

/* d = d / s.
   If s happens to be on top of the x87 stack, the form that writes to a
   lower stack entry is used; otherwise d is brought to the top and the
   form that writes to the top entry is used. */
LOWFUNC(NONE, NONE, 2, raw_fdiv_rr, (FRW d, FR s))
{
    usereg(s);
    usereg(d);

    if (live.spos[s] != live.tos)
    {
        make_tos(d);
        const int src_depth = stackpos(s);

        emit_byte(0xd8);
        emit_byte(0xf0 + src_depth); /* div dest by source */
    }
    else
    {
        /* Source is on top of stack */
        const int dst_depth = stackpos(d);

        emit_byte(0xdc);
        emit_byte(0xf8 + dst_depth); /* div dest by source */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fdiv_rr, (FRW d, FR s))

/* d = remainder of d / s, using the x87 partial-remainder instruction
   that operates on the top two stack entries.  d is therefore arranged
   on top with s directly below it; the function aborts if that layout
   could not be established. */
LOWFUNC(NONE, NONE, 2, raw_frem_rr, (FRW d, FR s))
{
    usereg(s);
    usereg(d);

    make_tos2(d, s);

    const int src_depth = stackpos(s);
    if (src_depth != 1)
    {
        printf("Failed horribly in raw_frem_rr! ds is %d\n", src_depth);
        abort();
    }

    emit_byte(0xd9);
    emit_byte(0xf8); /* take rem from dest by source */
}
LENDFUNC(NONE, NONE, 2, raw_frem_rr, (FRW d, FR s))

/* d = remainder of d / s, using the second x87 partial-remainder
   instruction (opcode d9 f5), which also operates on the top two stack
   entries.  d is arranged on top with s directly below it; the function
   aborts if that layout could not be established. */
LOWFUNC(NONE, NONE, 2, raw_frem1_rr, (FRW d, FR s))
{
    usereg(s);
    usereg(d);

    make_tos2(d, s);

    const int src_depth = stackpos(s);
    if (src_depth != 1)
    {
        printf("Failed horribly in raw_frem1_rr! ds is %d\n", src_depth);
        abort();
    }

    emit_byte(0xd9);
    emit_byte(0xf5); /* take rem1 from dest by source */
}
LENDFUNC(NONE, NONE, 2, raw_frem1_rr, (FRW d, FR s))


/* Bring FP register r to the top of the x87 stack and emit ftst on it
   (a compare of the top entry against 0.0 that sets the FPU condition
   codes). */
LOWFUNC(NONE, NONE, 1, raw_ftst_r, (FR r))
{
    make_tos(r);
    emit_byte(0xd9);  /* ftst */
    emit_byte(0xe4);
}
LENDFUNC(NONE, NONE, 1, raw_ftst_r, (FR r))

/* %eax register is clobbered if target processor doesn't support fucomi */
/* have_cmov is what raw_fflags_into_flags() tests to choose between the
   fucomi path and the fucom / fstsw ax / sahf fallback, so the clobber
   condition is expressed in terms of it. */
#define FFLAG_NREG_CLOBBER_CONDITION !have_cmov
#define FFLAG_NREG EAX_INDEX

/* Emit code that compares FP register r against 0.0 and leaves the
   outcome in the host CPU flags.
   A zero is pushed and swapped with r's slot so that r's value is on top
   for the compare; afterwards the value is stored back into its slot and
   the temporary zero is discarded, so the tracked stack layout is
   unchanged.  Without fucomi (selected via have_cmov) the result is
   routed through fstsw ax / sahf, which overwrites %ax. */
static __inline__ void raw_fflags_into_flags(int r)
{
    usereg(r);
    const int depth = stackpos(r);

    emit_byte(0xd9);
    emit_byte(0xee); /* Push 0 */
    emit_byte(0xd9);
    emit_byte(0xc9 + depth); /* swap top two around */

    if (!have_cmov)
    {
        emit_byte(0xdd);
        emit_byte(0xe1 + depth); /* fucom them */
        emit_byte(0x9b);
        emit_byte(0xdf);
        emit_byte(0xe0); /* fstsw ax */
        raw_sahf(0); /* sahf */
    }
    else
    {
        // gb-- fucomi is for P6 cores only, not K6-2 then...
        emit_byte(0xdb);
        emit_byte(0xe9 + depth); /* fucomi them */
    }

    emit_byte(0xdd);
    emit_byte(0xd9 + depth);  /* store value back, and get rid of 0 */
}